In [1]:
# Standard imports
import pandas as pd
import numpy as np
import logomaker
import time

%matplotlib inline
import matplotlib.pyplot as plt

# Import marginalization function
from utils import x_to_ct_df

In [2]:
# Read the gzip-compressed in vivo count table
df = pd.read_csv('data/in_vivo_data.csv.gz', compression='gzip')

# Short display labels for the treatment and replicate identifiers in the file
treatment_labels = {'background':'none',
                    'PPP+P':'Rpp',
                    'OH+P':'PNK',
                    'All_end':'both'}
replicate_labels = {'KS112':'rep1',
                    'KS113':'rep2',
                    'KS114':'rep3'}

# Rename columns first ('sample' becomes 'replicate'), then relabel the values
df = df.rename(columns={'DNA count':'DNA', 'sample':'replicate'})
df['treatment'] = df['treatment'].map(treatment_labels)
df['replicate'] = df['replicate'].map(replicate_labels)

# Columns whose name starts with 'p' are converted to the integer that
# follows the 'p'; every other column name is kept unchanged
df.columns = [int(name[1:]) if name[0]=='p' else name for name in df.columns]
df.head()

Unnamed: 0,N10,treatment,replicate,DNA,4,5,6,7,8,9,10
0,AAAAAAAAAA,both,rep1,18,0,0,0,0,0,0,0
1,AAAAAAAAAC,both,rep1,41,0,0,0,0,0,0,0
2,AAAAAAAAAG,both,rep1,38,0,0,0,0,0,0,0
3,AAAAAAAAAT,both,rep1,30,0,0,0,0,0,0,0
4,AAAAAAAACA,both,rep1,14,0,0,0,0,0,0,0


In [3]:
# End positions (these are the integer column labels of df) and the length
# used to index logo positions 1..L
L = 10
poss = list(range(4, 11))

# One row per N10 sequence; pivoting yields column levels
# (position, treatment, replicate), which are reordered to
# (replicate, treatment, position)
ct_df = (
    df.pivot(index='N10',
             columns=['treatment','replicate'],
             values=poss)
      .reorder_levels([2,1,0], axis=1)
)

# Put the columns in sorted order
sorted_cols = ct_df.columns.sortlevel()[0]
ct_df = ct_df[sorted_cols]

# Preview ct_df
ct_df.head()

replicate,rep1,rep1,rep1,rep1,rep1,rep1,rep1,rep1,rep1,rep1,...,rep3,rep3,rep3,rep3,rep3,rep3,rep3,rep3,rep3,rep3
treatment,PNK,PNK,PNK,PNK,PNK,PNK,PNK,Rpp,Rpp,Rpp,...,both,both,both,none,none,none,none,none,none,none
Unnamed: 0_level_2,4,5,6,7,8,9,10,4,5,6,...,8,9,10,4,5,6,7,8,9,10
N10,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
AAAAAAAAAA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAAAAAAAC,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAAAAAAAG,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAAAAAAAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAAAAAACA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Read the pre-computed efficiencies as a Series indexed by the first two
# columns of the file, then swap the two index levels so that entries are
# keyed as (replicate, sample)
eps = (
    pd.read_csv('csv_results/efficiencies.csv', index_col=[0,1])['efficiency']
      .reorder_levels([1,0])
)
eps

replicate  sample
rep1       none      1.000000
rep2       none      1.000000
rep3       none      1.000000
rep1       Rpp       0.910393
rep2       Rpp       0.922886
rep3       Rpp       0.987480
rep1       PNK       0.755147
rep2       PNK       0.751011
rep3       PNK       0.753237
rep1       both      0.911045
rep2       both      0.896461
rep3       both      0.922206
Name: efficiency, dtype: float64

In [5]:
# Dinucleotides to condition on ('NN' = no conditioning) and their display
# names, written as e.g. 'ApU' (T is shown as U)
dinucs = ['NN'] + [b1+b2 for b1 in 'ACGT' for b2 in 'ACGT']
primers = [(d[0]+'p'+d[1]).replace('T','U') for d in dinucs]

# Table of OH counts by position: one row per (primer, replicate), one column
# per position in poss[:-1], initialised to zero.
#
# The table is built in a single step. DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic. The index is
# sorted to match the ordering the previous groupby-based construction
# produced. Float dtype is used because the values stored later are sums of
# efficiency-corrected (non-integer) counts.
replicates = ['rep1','rep2','rep3']
oh_stat_df = pd.DataFrame(
    0.0,
    index=pd.MultiIndex.from_product([sorted(primers), sorted(replicates)],
                                     names=['primer','replicate']),
    columns=poss[:-1],
)

In [6]:
# Get array of all N10 promoter sequences (the row labels of ct_df)
all_seqs = ct_df.index.values

# Start timer for the whole logo computation
t = time.time()

# The set of sequences matching a given (dinuc, pos) does not depend on the
# replicate, so each list is built once and reused across replicates instead
# of rescanning all_seqs every time.
matching_seqs = {}

# Iterate over replicates
for rep in replicates:

    # Feedback
    print(f'{rep}: ', end='')

    # Efficiency-corrected counts for each treatment in this replicate;
    # computed once and shared by the OH and PPP estimates below
    both_ct = ct_df[(rep,'both')]/eps[(rep,'both')]
    PNK_ct = ct_df[(rep,'PNK')]/eps[(rep,'PNK')]
    Rpp_ct = ct_df[(rep,'Rpp')]/eps[(rep,'Rpp')]
    none_ct = ct_df[(rep,'none')]/eps[(rep,'none')]

    # Create dataframe with efficiency-corrected OH counts for each promoter
    OH_df = 0.5*(both_ct + PNK_ct - Rpp_ct - none_ct)

    # Create dataframe with efficiency-corrected PPP counts for each promoter
    PPP_df = 0.5*(both_ct - PNK_ct + Rpp_ct - none_ct)

    # Compute total corrected counts: OH over all but the last position plus
    # PPP over all but the first position
    total_ct = OH_df[poss[:-1]].sum(axis=1) + PPP_df[poss[1:]].sum(axis=1)

    # Iterate over dinucs and their display names
    for dinuc, primer in zip(dinucs, primers):

        # Feedback
        print(f'{primer} ', end='')

        # Iterate over positions
        for pos in poss[:-1]:

            # Create filename
            file_name = f'csv_logos/invivo_p{pos}_{primer}_{rep}_logo.csv'

            # Get OH corrected counts
            OH_ct = OH_df[pos]

            # Get promoters whose bases at (1-indexed) positions pos and
            # pos+1 match the desired dinucleotide; 'NN' matches everything
            if dinuc == 'NN':
                seqs = all_seqs
            else:
                key = (dinuc, pos)
                if key not in matching_seqs:
                    matching_seqs[key] = [seq for seq in all_seqs
                                          if seq[(pos-1):(pos+1)]==dinuc]
                seqs = matching_seqs[key]

            # Add OH stats
            oh_stat_df.loc[(primer, rep),pos] = OH_df.loc[seqs,pos].sum()

            # Marginalize OH_ct with x_to_ct_df (from utils; presumably gives
            # per-position counts -- confirm against utils), relabel rows as
            # 1..L, then clip non-positive entries to 0 and add a pseudocount of 1
            fg_counts_df = x_to_ct_df(x=seqs, ct=OH_ct[seqs])
            fg_counts_df.index = range(1,L+1)
            fg_counts_df = fg_counts_df.where(fg_counts_df>0,0)+1

            # Rows where some entry equals the bare pseudocount, i.e. had a
            # non-positive count before regularization
            zero_count_poss = fg_counts_df.index[(fg_counts_df==1).any(axis=1)]

            # Marginalize total_ct to get bg_counts_df; same clipping, with a
            # pseudocount of 2*(len(poss)-1)
            bg_counts_df = x_to_ct_df(x=seqs, ct=total_ct[seqs])
            bg_counts_df.index = range(1,L+1)
            bg_counts_df = bg_counts_df.where(bg_counts_df>0,0)+2*(len(poss)-1)

            # Compute log2 enrichment ratio, blank rows flagged above, and
            # center values
            logo_df = np.log2(fg_counts_df/bg_counts_df)
            logo_df.loc[zero_count_poss,:] = 0.0
            logo_df = logomaker.transform_matrix(logo_df, center_values=True)

            # Modify logo index
            logo_df.index = range(1,L+1)

            # Set logo values to zero at the two positions that were
            # conditioned on (pos and pos+1)
            logo_df.loc[[pos, pos+1],:] = 0

            # Save logo
            logo_df.to_csv(file_name)

    # Newline feedback
    print('')

# Show execution time
print(f'Done! Execution time:{time.time()-t:.2f} sec.')

rep1: NpN ApA ApC ApG ApU CpA CpC CpG CpU GpA GpC GpG GpU UpA UpC UpG UpU 
rep2: NpN ApA ApC ApG ApU CpA CpC CpG CpU GpA GpC GpG GpU UpA UpC UpG UpU 
rep3: NpN ApA ApC ApG ApU CpA CpC CpG CpU GpA GpC GpG GpU UpA UpC UpG UpU 
Done! Execution time:115.32 sec.


In [7]:
# Turn the (primer, replicate) index into ordinary columns on a fresh frame
# (oh_stat_df itself is left untouched), write it to disk, and preview it
out_df = oh_stat_df.reset_index()
out_df.to_csv('csv_results/oh_counts_table.csv')
out_df.head()

Unnamed: 0,primer,replicate,4,5,6,7,8,9
0,ApA,rep1,0.153971,270.469849,3206.209324,2702.402852,1096.586942,555.130115
1,ApA,rep2,54.45368,245.168443,4051.189959,4812.025962,3521.058243,645.429137
2,ApA,rep3,40.758572,204.025771,3231.493927,2682.40106,1355.395298,27.347419
3,ApC,rep1,17.88195,158.577889,3049.709273,5011.971234,129.377103,-124.505645
4,ApC,rep2,37.747765,299.890478,3748.77525,7070.63493,2676.158825,196.929395
