In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logomaker
import re
import pdb
import glob
%matplotlib inline

In [2]:
# Read the chromosomal count table and relabel both of its axes
treatments = ['both', 'Rpp', 'PNK', 'none']
replicates = ['rep1', 'rep2', 'rep3']
L = 10
positions = list(range(-5, 6))

# One (treatment, replicate, position) label per count column;
# treatment varies slowest and position fastest
col_tuples = [
    (treatment, rep, pos)
    for treatment in treatments
    for rep in replicates
    for pos in positions
]
cols = pd.MultiIndex.from_tuples(col_tuples)

# The first three spreadsheet rows are parsed as the header and the first column as the index
ct_df = pd.read_excel('data/chromosomal_data.xlsx', header=[0, 1, 2], index_col=0)

# Keep only the rows whose index label is a string
ix = [isinstance(label, str) for label in ct_df.index]
ct_df = ct_df.loc[ix]

# Replace the parsed header with the labels built above, then upper-case every
# row label and truncate it to its first L characters
ct_df.columns = cols
ct_df.index = [label.upper()[:L] for label in ct_df.index]
ct_df

Unnamed: 0_level_0,both,both,both,both,both,both,both,both,both,both,...,none,none,none,none,none,none,none,none,none,none
Unnamed: 0_level_1,rep1,rep1,rep1,rep1,rep1,rep1,rep1,rep1,rep1,rep1,...,rep3,rep3,rep3,rep3,rep3,rep3,rep3,rep3,rep3,rep3
Unnamed: 0_level_2,-5,-4,-3,-2,-1,0,1,2,3,4,...,-4,-3,-2,-1,0,1,2,3,4,5
TAGTTATCGA,0.0,0.0,0.0,4.0,1962.0,183.0,3.0,7.0,19.0,2.0,...,0.0,0.0,4.0,2.0,222.0,4.0,4.0,19.0,5.0,5.0
TCTTTATGGT,0.0,0.0,0.0,0.0,26.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,2.0
GTGTTATAAA,0.0,0.0,0.0,0.0,25.0,28.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0
CTTCTATGAA,2.0,1.0,5.0,18.0,585.0,531.0,15.0,11.0,5.0,30.0,...,1.0,0.0,1.0,5.0,213.0,10.0,3.0,3.0,24.0,48.0
AAGTTAATTC,0.0,0.0,0.0,1.0,66.0,107.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,63.0,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTCTAGAGT,0.0,3.0,60.0,240.0,499.0,4693.0,201.0,28.0,12.0,15.0,...,0.0,2.0,97.0,139.0,2474.0,36.0,6.0,9.0,15.0,5.0
TGAGTACACG,3.0,11.0,11.0,57.0,15.0,957.0,7.0,4.0,1.0,0.0,...,8.0,7.0,33.0,13.0,527.0,10.0,0.0,0.0,0.0,0.0
ACTATAAAGT,0.0,0.0,13.0,36.0,10.0,1145.0,8.0,3.0,4.0,10.0,...,0.0,1.0,9.0,0.0,123.0,4.0,3.0,4.0,13.0,0.0
GAGTTACACC,0.0,1.0,16.0,103.0,187.0,53979.0,4346.0,241.0,41.0,127.0,...,1.0,15.0,82.0,136.0,37022.0,2960.0,157.0,23.0,118.0,480.0


In [3]:
# Load pre-computed efficiencies and take median values across replicates.
# The first two CSV columns form the index; one of those levels is named 'sample'.
eps_raw = pd.read_csv('csv_results/efficiencies.csv', index_col=[0,1])['efficiency']

# Group on the 'sample' index level directly so that only the efficiency values
# are aggregated. (Resetting the index first would turn the other index level
# into a data column that also gets fed to median(), which fails on recent
# pandas versions when that column is not numeric.)
eps_series = eps_raw.groupby(level='sample').median()
eps_series.index.name = 'treatment'

# Single-column frame, kept so that any code referring to eps_df still works
eps_df = eps_series.to_frame()
eps_series

treatment
PNK     0.753237
Rpp     0.922886
both    0.911045
none    1.000000
Name: efficiency, dtype: float64

In [4]:
# For each replicate compute the OH@-1/total ratio, then average (mean) across replicates.

# Ratio table: one row per sequence, one column per replicate
df = pd.DataFrame(index=ct_df.index, columns=replicates)

for rep in replicates:

    # OH counts at position -1: 'both' minus 'Rpp', with non-positive
    # differences replaced by 0, plus a pseudocount of 1
    OH_series = ct_df[('both', rep, -1)].sub(ct_df[('Rpp', rep, -1)])
    OH_series = OH_series.where(OH_series.gt(0), 0).add(1)

    # Total 'both' counts summed over every position of this replicate, plus L
    total_series = ct_df[('both', rep)].sum(axis='columns').add(L)

    # Ratio of OH counts to total counts for this replicate
    df[rep] = OH_series.div(total_series)

# Mean of the per-replicate ratios for each sequence
ratio_series = df.mean(axis='columns')
ratio_series

  result = self._run_cell(
  result = self._run_cell(
  result = self._run_cell(


TAGTTATCGA    0.878277
TCTTTATGGT    0.562302
GTGTTATAAA    0.463116
CTTCTATGAA    0.453441
AAGTTAATTC    0.420636
                ...   
TTTCTAGAGT    0.003879
TGAGTACACG    0.005471
ACTATAAAGT    0.002435
GAGTTACACC    0.000922
TATGTACAAT    0.031752
Length: 93, dtype: float64

In [5]:
# Compute and save logo

# First create an all-zero df: one row per (1-based) position, one column per base.
# Float zeros are used so that writing log2 values below never has to change
# the dtype of an integer column.
seqs = ratio_series.index
weights = ratio_series.values
L = len(seqs[0])
alphabet = ['A','C','G','T']
logo_df = pd.DataFrame(index=range(1,L+1),
                       columns=alphabet,
                       data=0.0)

# Compute log2 of the average OH ratio observed for each base at each position
for l in range(L):
    for c in alphabet:
        n_lc = 0
        w_lc = 0
        for weight, seq in zip(weights,seqs):
            if seq[l]==c:
                w_lc += weight
                n_lc += 1
        # Write once per (position, base), after every sequence has been tallied;
        # a base never observed at this position keeps its initial value of 0.0
        if n_lc > 0:
            logo_df.loc[l+1,c] = np.log2(w_lc / n_lc)

# Center logo
logo_df = logomaker.transform_matrix(logo_df, center_values=True)

# Put dinuc positions to zero (a list makes the row selection explicit)
dinuc_positions = (5,6)
logo_df.loc[list(dinuc_positions),:] = 0

# Save logo dataframe
file_name = 'csv_logos/chromosomal_logo.csv'
print(f'Writing {file_name}: ', end='')
logo_df.to_csv(file_name)

# Show data frame
logo_df

Writing csv_logos/chromosomal_logo.csv: 

Unnamed: 0_level_0,A,C,G,T
pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.07201,-0.113353,-0.247283,0.288626
2,0.086867,-0.150612,-0.202837,0.266582
3,-0.27087,-0.369612,0.409536,0.230946
4,-0.320032,0.290069,-0.778858,0.808821
5,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0
7,0.344054,-0.596682,-0.685226,0.937855
8,-0.352858,0.272243,0.090917,-0.010302
9,0.28984,-0.262266,0.172963,-0.200537
10,0.482761,-0.159797,-0.238179,-0.084785


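The height of each character c at position l represents the log2 of the average OH/total ratio computed across chromosomal sequences having character c at position l, centered so that the four values at each position sum to zero. Positions 5 and 6 (the dinucleotide positions) are set to zero.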