In [1]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logomaker
import time
import glob
%matplotlib inline

# Import marginalization function
from utils import x_to_ct_df

# Shorthand for slicing MultiIndex axes inside .loc (e.g. df.loc[rows, idx[conc, rep, :, :]])
idx = pd.IndexSlice

In [2]:
# Values that label the column levels of the count table built below.

# Positions 4..10; also used as 1-based offsets into each N10 sequence
poss = [*range(4, 11)]

# Concentrations in uM (as they appear in the data file names)
concs = [40, 160, 640]

# Replicate labels used in the analysis
replicates = [f'rep{k}' for k in (1, 2, 3)]

# Treatment labels used in the analysis
treatments = [
    'Rpp',
    'mockRpp',
    'PNK',
    'mockPNK',
    'DNA',
]

In [3]:
### Load in vitro data
#
# Reads one count file per (concentration, replicate, treatment), pivots each
# into a wide frame indexed by N10 sequence, and joins all of them into `df`,
# whose columns are a MultiIndex ordered (conc, replicate, treatment, position).

# For mapping file conventions to analysis naming conventions
treatment_map_dict = {'PPP':'Rpp', 'PPPmock':'mockRpp', 'OH':'PNK', 'OHmock':'mockPNK'}
replicate_map_dict = {'repA':'rep1', 'repB':'rep2', 'repC':'rep3'}
cols_dict = {f'tnmP{i:d}':i for i in poss}
cols_dict['DNA count'] = 'DNA'

# Collect one pivoted frame per file; they are joined once after the loops.
# (Merging into a growing frame inside the loop re-copies the accumulated
# data on every iteration.)
pivot_dfs = []

# Iterate over concentrations
for conc in concs:

    # Iterate over replicates
    for rep in replicate_map_dict.keys():

        # Iterate over treatments
        for treatment in treatment_map_dict.keys():

            # Find file; exactly one file must match the pattern
            pattern = f'data/in_vitro/*_{conc:d}uM_{treatment}_{rep}.csv.gz'
            file_names = glob.glob(pattern)
            assert len(file_names)==1, \
                f'Should only find one file matching pattern {pattern!r}; found {len(file_names)}'
            file_name = file_names[0]

            # Load data and translate file column names to analysis names
            print(f'Loading {file_name}...')
            tmp_df = pd.read_csv(file_name, index_col=0).rename(columns=cols_dict)

            # Add iteration info and remove other info
            # NOTE: 'DNA' is kept here (so a missing column still raises), but it is
            # not listed in the pivot's `values`, so DNA counts do not end up in df.
            tmp_df['replicate'] = replicate_map_dict[rep]
            tmp_df['treatment'] = treatment_map_dict[treatment]
            tmp_df['conc'] = conc
            tmp_df = tmp_df[['conc', 'replicate', 'treatment', 'DNA'] + poss]
            tmp_df.index.name = 'N10'
            tmp_df = tmp_df.reset_index()

            # Pivot dataframe: column levels become (position, conc, treatment, replicate)
            pivot_df = tmp_df.pivot(index='N10',
                                    columns=['conc','treatment','replicate'],
                                    values=poss)
            pivot_dfs.append(pivot_df)

# Join all files side by side on the N10 index (union of sequences, as with
# an outer merge), in a single operation
df = pd.concat(pivot_dfs, axis=1, join='outer')

# An outer merge returns rows sorted by key; keep that ordering, but skip the
# sort (and the copy it makes) when the index is already sorted
if not df.index.is_monotonic_increasing:
    df = df.sort_index(axis=0)

# Clean up df: reorder column levels to (conc, replicate, treatment, position)
# and sort the columns lexicographically
df = df.reorder_levels([1,3,2,0], axis=1)
df = df.sort_index(axis=1)

# Feedback
print('Done!')
df.head()

Loading data/in_vitro/KS89A_40uM_PPP_repA.csv.gz...
Loading data/in_vitro/KS89A_40uM_PPPmock_repA.csv.gz...
Loading data/in_vitro/KS89A_40uM_OH_repA.csv.gz...
Loading data/in_vitro/KS89A_40uM_OHmock_repA.csv.gz...
Loading data/in_vitro/KS90A_40uM_PPP_repB.csv.gz...
Loading data/in_vitro/KS90A_40uM_PPPmock_repB.csv.gz...
Loading data/in_vitro/KS90A_40uM_OH_repB.csv.gz...
Loading data/in_vitro/KS90A_40uM_OHmock_repB.csv.gz...
Loading data/in_vitro/KS91A_40uM_PPP_repC.csv.gz...
Loading data/in_vitro/KS91A_40uM_PPPmock_repC.csv.gz...
Loading data/in_vitro/KS91A_40uM_OH_repC.csv.gz...
Loading data/in_vitro/KS91A_40uM_OHmock_repC.csv.gz...
Loading data/in_vitro/KS92A_160uM_PPP_repA.csv.gz...
Loading data/in_vitro/KS92A_160uM_PPPmock_repA.csv.gz...
Loading data/in_vitro/KS92A_160uM_OH_repA.csv.gz...
Loading data/in_vitro/KS92A_160uM_OHmock_repA.csv.gz...
Loading data/in_vitro/KS93A_160uM_PPP_repB.csv.gz...
Loading data/in_vitro/KS93A_160uM_PPPmock_repB.csv.gz...
Loading data/in_vitro/KS93A_16

conc,40,40,40,40,40,40,40,40,40,40,...,640,640,640,640,640,640,640,640,640,640
replicate,rep1,rep1,rep1,rep1,rep1,rep1,rep1,rep1,rep1,rep1,...,rep3,rep3,rep3,rep3,rep3,rep3,rep3,rep3,rep3,rep3
treatment,PNK,PNK,PNK,PNK,PNK,PNK,PNK,Rpp,Rpp,Rpp,...,mockPNK,mockPNK,mockPNK,mockRpp,mockRpp,mockRpp,mockRpp,mockRpp,mockRpp,mockRpp
Unnamed: 0_level_3,4,5,6,7,8,9,10,4,5,6,...,8,9,10,4,5,6,7,8,9,10
N10,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
AAAAAAAAAA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAAAAAAAC,0,0,0,0,0,0,0,1,0,2,...,0,0,0,0,0,0,0,0,0,0
AAAAAAAAAG,0,0,0,0,0,0,0,4,1,1,...,0,0,0,0,0,0,0,0,0,0
AAAAAAAAAT,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
AAAAAAAACA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
### Compute and save an enrichment logo matrix for every replicate,
### concentration, and position (one CSV file per combination)

# Loop-invariant settings
L = 10                  # index length applied to the count matrices (rows 1..L)
num_poss = len(poss)

# Get list of all N10 promoters
all_seqs = df.index.values

# Sequences that have 'TA' at the primer site. This depends only on pos, so it
# is computed once per position instead of once per (rep, conc, pos).
seqs_by_pos = {pos: [seq for seq in all_seqs if seq[(pos-1):(pos+1)]=='TA']
               for pos in poss[:-1]}

# Start the timer before the loops so the reported time covers the whole run
t = time.time()

# For all replicates
for rep in replicates:

    # For all concentrations
    for conc in concs:

        # For all positions except the last
        for pos in poss[:-1]:

            # Define filename
            file_name = f'csv_logos/invitro_p{pos}_{conc}uM_{rep}_logo.csv'

            # Print position for feedback
            print(f'computing {file_name}...')

            # Get sequences that have 'TA' at primer site
            seqs = seqs_by_pos[pos]

            # Select this (conc, rep) and subtract mock counts from treated counts
            panel_df = df.loc[seqs, idx[conc, rep, :, :]].droplevel([0,1], axis=1)
            OH_df = panel_df['PNK'] - panel_df['mockPNK']
            PPP_df = panel_df['Rpp'] - panel_df['mockRpp']

            # Get total_ct: OH counts over all but the last position plus
            # PPP counts over all but the first position
            total_ct = OH_df[poss[:-1]].sum(axis=1) + PPP_df[poss[1:]].sum(axis=1)

            # Get OH_ct
            OH_ct = OH_df[pos]

            # Marginalize and regularize OH_ct to get fg_counts_df:
            # non-positive counts are set to 0, then a pseudocount of 1 is added
            fg_counts_df = x_to_ct_df(x=seqs, ct=OH_ct)
            fg_counts_df.index = range(1,L+1)
            fg_counts_df = fg_counts_df.where(fg_counts_df>0,0)+1

            # Rows in which at least one entry is exactly the pseudocount
            zero_count_poss = fg_counts_df.index[(fg_counts_df==1).any(axis=1)]

            # Marginalize total_ct to get bg_counts_df (pseudocount 2*num_poss)
            bg_counts_df = x_to_ct_df(x=seqs, ct=total_ct)
            bg_counts_df.index = range(1,L+1)
            bg_counts_df = bg_counts_df.where(bg_counts_df>0,0)+2*num_poss

            # Compute enrichment ratio, blank rows lacking counts, and center values
            logo_df = np.log2(fg_counts_df/bg_counts_df)
            logo_df.loc[zero_count_poss,:] = 0.0
            logo_df = logomaker.transform_matrix(logo_df, center_values=True)

            # Modify logo index
            logo_df.index = range(1,L+1)

            # Set logo values to zero at A-site and P-site (determined by pos);
            # a list key makes the row-label selection explicit
            logo_df.loc[[pos, pos+1],:] = 0

            # Save logo
            logo_df.to_csv(file_name)

# Show execution time
print(f'Done! Execution time:{time.time()-t:.2f} sec.')

computing csv_logos/invitro_p4_40uM_rep1_logo.csv...
computing csv_logos/invitro_p5_40uM_rep1_logo.csv...
computing csv_logos/invitro_p6_40uM_rep1_logo.csv...
computing csv_logos/invitro_p7_40uM_rep1_logo.csv...
computing csv_logos/invitro_p8_40uM_rep1_logo.csv...
computing csv_logos/invitro_p9_40uM_rep1_logo.csv...
computing csv_logos/invitro_p4_160uM_rep1_logo.csv...
computing csv_logos/invitro_p5_160uM_rep1_logo.csv...
computing csv_logos/invitro_p6_160uM_rep1_logo.csv...
computing csv_logos/invitro_p7_160uM_rep1_logo.csv...
computing csv_logos/invitro_p8_160uM_rep1_logo.csv...
computing csv_logos/invitro_p9_160uM_rep1_logo.csv...
computing csv_logos/invitro_p4_640uM_rep1_logo.csv...
computing csv_logos/invitro_p5_640uM_rep1_logo.csv...
computing csv_logos/invitro_p6_640uM_rep1_logo.csv...
computing csv_logos/invitro_p7_640uM_rep1_logo.csv...
computing csv_logos/invitro_p8_640uM_rep1_logo.csv...
computing csv_logos/invitro_p9_640uM_rep1_logo.csv...
computing csv_logos/invitro_p4_40u