In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, shutil, bz2
from pathlib import Path
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
from numpy.random import Generator, PCG64
# NOTE(review): PCG64() is created without a seed, so draws from rng_pg are
# not reproducible between kernel sessions. Pass an explicit seed if needed.
rng_pg = Generator(PCG64())
from scipy.optimize import newton, minimize, fsolve
from scipy.optimize import nnls

### Plotting imports ###
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib as mpl
import matplotlib.ticker as ticker
import matplotlib.gridspec as gridspec
palette = list(mcolors.TABLEAU_COLORS.keys())
sns.set_theme(style="ticks", palette="muted")
sns.set_context("notebook")
%matplotlib inline

In [2]:
# Navigate back to NBdir in case of re-running a code block.
# NBdir is only captured the first time this cell runs in a kernel session,
# so a later os.chdir() elsewhere does not change what it points to.
if 'NBdir' not in globals():
    NBdir = os.getcwd()
print('Notebook is in: {}'.format(NBdir))
os.chdir(NBdir)  # If you changed the current working dir, this will take you back to the notebook dir.

# Define the path to the repo folder.
# Change if necessary.
# The repo root is taken to be two levels above the notebook directory.
# pathlib is used instead of splitting on '/' so this also works with
# platform-specific path separators.
homedir = str(Path(NBdir).parent.parent)
print('Repo is in: {}'.format(homedir))
# Only register the repo once; re-running this cell must not keep growing sys.path.
if homedir not in sys.path:
    sys.path.insert(1, homedir)
from src.stats_collection import STATS_collection
from src.plotting import TRNA_plot
from src.curve_fitting import loss_func_c1, loss_func_l1, loss_func_l2, hl_bsl_fit, bootstrap_hl, bootstrap_hl_fast

# These are default folder names for data and raw fastq files
# relative to the folder in which this notebook is in:
data_dir = 'data'
stats_dir = 'stats_collection'
plotting_dir = 'plotting'

Notebook is in: /Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA-charge-seq/projects/charge_half-life
Repo is in: /Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA-charge-seq


### Settings

In [3]:
# The sample sheet is read from the notebook directory.
sample_list_fnam = 'sample_list.xlsx'
sample_df = pd.read_excel('/'.join((NBdir, sample_list_fnam)))

# Gather the paths used for data processing in a single dictionary:
dir_dict = {
    'NBdir': NBdir,
    'data_dir': data_dir,
    'stats_dir': stats_dir,
    'plotting_dir': plotting_dir,
}

In [4]:
# Show the path configuration that is handed to TRNA_plot.
dir_dict

{'NBdir': '/Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA-charge-seq/projects/charge_half-life',
 'data_dir': 'data',
 'stats_dir': 'stats_collection',
 'plotting_dir': 'plotting'}

In [5]:
# Create the plotting object from the sample sheet and the aggregated
# stats file, then ask it to build its charge dataframe:
plot_obj = TRNA_plot(
    dir_dict,
    sample_df=sample_df,
    stats_fnam='ALL_stats_aggregate.csv',
    overwrite_dir=False,
)
plot_obj.get_charge_df()



Folder exists and overwrite set to false... Doing nothing.


In [6]:
# Preview the first three rows of the charge dataframe.
plot_obj.charge_df.head(3)

Unnamed: 0,sample_name_unique,sample_name,replicate,barcode,tRNA_annotation,tRNA_anno_short,tRNA_annotation_len,unique_annotation,codon,anticodon,AA_codon,amino_acid,single_codon,single_aa,mito_codon,Ecoli_ctr,AA_letter,A_count,C_count,count,charge,RPM
0,0m_1,0m,1,l1Sp,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,eColiLys-TTT-1-1,76,True,AAA,TTT,eColiLys-AAA,eColiLys,True,True,False,True,K,0,20,20,0.0,464.597658
1,0m_1,0m,1,l1Sp,Homo_sapiens_mito_tRNA-Ala-TGC,Ala-TGC,72,True,GCA,TGC,Ala-GCA,Ala,True,True,True,False,A,1,48,49,2.040816,1138.264263
2,0m_1,0m,1,l1Sp,Homo_sapiens_mito_tRNA-Arg-TCG,Arg-TCG,68,True,CGA,TCG,Arg-CGA,Arg,True,True,True,False,R,9,98,107,8.411215,2485.597473


In [7]:
# Preview the first three rows of the stats table held by plot_obj.
plot_obj.all_stats.head(3)

Unnamed: 0,sample_name_unique,sample_name,replicate,barcode,tRNA_annotation,tRNA_annotation_len,unique_annotation,5p_cover,align_3p_nt,codon,anticodon,amino_acid,count,AA_codon,tRNA_anno_short,single_codon,single_aa,mito_codon,Ecoli_ctr,AA_letter
0,0m_1,0m,1,l1Sp,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,False,C,AAA,TTT,eColiLys,14,eColiLys-AAA,eColiLys-TTT-1-1,True,True,False,True,K
1,0m_1,0m,1,l1Sp,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,True,C,AAA,TTT,eColiLys,6,eColiLys-AAA,eColiLys-TTT-1-1,True,True,False,True,K
2,0m_1,0m,1,l1Sp,Homo_sapiens_mito_tRNA-Ala-TGC,72,True,False,C,GCA,TGC,Ala,33,Ala-GCA,Ala-TGC,True,True,True,False,A


In [8]:
# Preview the first three rows of the 'tr' entry of charge_filt
# (the table that is merged with the sample sheet further down).
plot_obj.charge_filt['tr'].head(3)

Unnamed: 0,sample_name_unique,sample_name,replicate,barcode,tRNA_annotation,tRNA_anno_short,tRNA_annotation_len,codon,anticodon,AA_codon,amino_acid,AA_letter,mito_codon,Ecoli_ctr,count,A_count,C_count,RPM,charge
0,0m_1,0m,1,l1Sp,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,eColiLys-TTT-1-1,76,AAA,TTT,eColiLys-AAA,eColiLys,K,False,True,20,0,20,464.597658,0.0
1,0m_1,0m,1,l1Sp,Homo_sapiens_mito_tRNA-Ala-TGC,Ala-TGC,72,GCA,TGC,Ala-GCA,Ala,A,True,False,49,1,48,1138.264263,2.040816
2,0m_1,0m,1,l1Sp,Homo_sapiens_mito_tRNA-Arg-TCG,Arg-TCG,68,CGA,TCG,Arg-CGA,Arg,R,True,False,107,9,98,2485.597473,8.411215


In [9]:
# List the distinct tRNA annotations present in the 'tr' table.
# NOTE(review): this prints every annotation; consider showing only the
# count or a sorted slice to keep the output short.
set(plot_obj.charge_filt['tr']['tRNA_annotation'])

{'Escherichia_coli_str_K_12_substr_MG1655_tRNA-eColiLys-TTT-1-1',
 'Homo_sapiens_mito_tRNA-Ala-TGC',
 'Homo_sapiens_mito_tRNA-Arg-TCG',
 'Homo_sapiens_mito_tRNA-Asn-GTT',
 'Homo_sapiens_mito_tRNA-Asp-GTC',
 'Homo_sapiens_mito_tRNA-Cys-GCA',
 'Homo_sapiens_mito_tRNA-Gln-TTG',
 'Homo_sapiens_mito_tRNA-Glu-TTC',
 'Homo_sapiens_mito_tRNA-Gly-TCC',
 'Homo_sapiens_mito_tRNA-His-GTG',
 'Homo_sapiens_mito_tRNA-Ile-GAT',
 'Homo_sapiens_mito_tRNA-Leu1-TAG',
 'Homo_sapiens_mito_tRNA-Leu2-TAA',
 'Homo_sapiens_mito_tRNA-Lys-TTT',
 'Homo_sapiens_mito_tRNA-Met-CAT',
 'Homo_sapiens_mito_tRNA-Phe-GAA',
 'Homo_sapiens_mito_tRNA-Pro-TGG',
 'Homo_sapiens_mito_tRNA-Ser2-TGA',
 'Homo_sapiens_mito_tRNA-Thr-TGT',
 'Homo_sapiens_mito_tRNA-Trp-TCA',
 'Homo_sapiens_mito_tRNA-Tyr-GTA',
 'Homo_sapiens_mito_tRNA-Val-TAC',
 'Homo_sapiens_tRNA-Ala-AGC-1-1',
 'Homo_sapiens_tRNA-Ala-AGC-10-1',
 'Homo_sapiens_tRNA-Ala-AGC-11-1',
 'Homo_sapiens_tRNA-Ala-AGC-12-1',
 'Homo_sapiens_tRNA-Ala-AGC-13-1',
 'Homo_sapiens_tRNA-Al

In [10]:
# Attach the plotting metadata columns from the sample sheet to every
# row of the 'tr' charge table, matching on the unique sample name:
charge_df = pd.merge(
    sample_df.loc[:, ['sample_name_unique', 'hue_value', 'hue_order']],
    plot_obj.charge_filt['tr'],
    on='sample_name_unique',
)

In [11]:
# Enforce minimum read count:
min_count = 4


def _sample_name_to_minutes(sample_name):
    """Convert a sample name such as '30m' or '4h' to a time in minutes.

    Names containing 'NoOx' carry no time point and yield None.
    The unit is read from the last character ('m' = minutes, 'h' = hours)
    and everything before it must be an integer.

    Raises ValueError if the name does not end in a known unit or if the
    part before the unit cannot be parsed as an integer.
    """
    if 'NoOx' in sample_name:
        return None
    minutes_per_unit = {'m': 1, 'h': 60}
    value, unit = sample_name[:-1], sample_name[-1:]
    if unit not in minutes_per_unit:
        raise ValueError('Cannot parse a time from sample name: {!r}'.format(sample_name))
    return int(value) * minutes_per_unit[unit]


# Map each tRNA annotation to the set of samples in which it passes the
# filter, i.e. it is not the E. coli control and has at least min_count reads:
tann2snu = dict()
for snu, tann, ectr, cnt in zip(charge_df['sample_name_unique'],
                                charge_df['tRNA_annotation'],
                                charge_df['Ecoli_ctr'],
                                charge_df['count']):
    if ectr or cnt < min_count:
        continue
    tann2snu.setdefault(tann, set()).add(snu)

# Filter based on minimum:
# keep only the annotations that pass the filter in every sample.
all_snu = set(charge_df['sample_name_unique'])
tann_sele = {tann for tann, snu_set in tann2snu.items() if snu_set == all_snu}

mask = charge_df['tRNA_annotation'].isin(tann_sele)
filt_df = charge_df[mask].copy()

# Convert time to minutes:
time_list = [_sample_name_to_minutes(sn) for sn in filt_df['sample_name']]
filt_df['Time'] = time_list
# Samples without a time point ('NoOx') are excluded from the half-life data.
hl_df = filt_df[~filt_df['Time'].isna()].copy()

In [12]:
#### Do this to make data ###
# Delete when real data is available
#
# Replaces the measured charge with a synthetic time course plus Gaussian
# noise and adds three more synthetic replicates (2, 3 and 4).
hl_df = hl_df.sort_values(['tRNA_annotation', 'Time'])

# One synthetic charge value per time point; repeated once per selected
# annotation so that it lines up with the sorted rows of hl_df.
ch_timecourse = [96.97580645161288, 95.96774193548386, 92.13709677419355, 88.30645161290322,
                 64.51612903225806, 42.33870967741935, 18.548387096774185, 2.2177419354838785]
ch_list = np.tile(np.array(ch_timecourse), len(tann_sele))

# Fail early with a clear message if the rows do not line up, e.g. when this
# cell is re-run after hl_df has already been extended with replicates.
if len(ch_list) != len(hl_df):
    raise ValueError(
        'Expected {} rows in hl_df ({} time points x {} annotations) but found {}. '
        'Re-run the cell that builds hl_df before running this one.'.format(
            len(ch_list), len(ch_timecourse), len(tann_sele), len(hl_df)))

# Seeded generator so the synthetic data is identical on every run.
synth_rng = np.random.default_rng(0)
ch_scale = ch_list/100 + 0.5  # Noise standard deviation per data point.
hl_df['charge'] = synth_rng.normal(ch_list, ch_scale)

# Each extra replicate is a copy of the first one with freshly drawn noise.
replicate_frames = [hl_df]
for replicate in (2, 3, 4):
    hl_df_cp = hl_df.copy()
    hl_df_cp['replicate'] = replicate
    hl_df_cp['charge'] = synth_rng.normal(ch_list, ch_scale)
    replicate_frames.append(hl_df_cp)
hl_df = pd.concat(replicate_frames)


In [13]:
# Iterate in sorted order: tann_sele is a set, and the iteration order of a
# set of strings can differ between kernel sessions. hl_spl keeps the value
# from the final iteration and is used by the cells below, so a fixed order
# makes that selection reproducible.
for tann in sorted(tann_sele):
    sample_mask = hl_df['tRNA_annotation'] == tann
    hl_spl = hl_df[sample_mask].copy()

    # TODO: enable once the per-annotation estimates are collected:
    # hl_p_est, hl_ci = bootstrap_hl(hl_spl)

Half-life

$$
N(t) = N_0 \left( \frac{1}{2} \right)^{\frac{t}{t_{1/2}}} + N_{\infty}
$$


$$
\log_2(N(t) - N_{\infty}) = \log_2(N_0) - \frac{t}{t_{1/2}}
$$

$$
\log_2(N(t) - N_{\infty}) = \log_2(N_0) - t \frac{1}{t_{1/2}}
$$

$$
t_{1/2} = \frac{t}{\log_2(N_0) - \log_2(N(t) - N_{\infty})}
$$

In [60]:
# Scratch check: np.tile with a repeat count of 1 returns the input values unchanged.
np.tile(np.array([1,2,3]), 1)

array([1, 2, 3])

In [51]:
# Scratch check: small 3x4 test matrix holding 0..11 in row-major order.
mat = np.arange(0, 12).reshape((3,4))

In [52]:
# Display the test matrix.
mat

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [56]:
# Scratch check: flatten the matrix column by column (Fortran order).
fmat = mat.flatten(order='F')
fmat

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

In [59]:
# Scratch check: reshaping with order='F' undoes the column-major flatten,
# so the mean over axis 1 gives the row means of the original matrix.
np.mean(fmat.reshape((3,4), order='F'), axis=1)

array([1.5, 5.5, 9.5])

In [14]:
# Bootstrap on hl_spl (the subset left over from the per-annotation loop)
# with the L2 loss and lstsq=True. The result is a (scalar, two-element array)
# pair; presumably the half-life point estimate and its confidence interval,
# going by the `hl_p_est, hl_ci` naming used earlier. Confirm in src.curve_fitting.
bootstrap_hl(hl_spl, Ndraws=100000, BFGS_loss_func=loss_func_l2, lstsq=True)

(401.96860096690966, array([363.83086104, 427.31680073]))

In [36]:
# Same call as above but with minus_N_bz=2.
# NOTE(review): the meaning of minus_N_bz is defined in src.curve_fitting and
# is not visible here. Document it next to this call.
bootstrap_hl(hl_spl, Ndraws=100000, BFGS_loss_func=loss_func_l2, lstsq=True, minus_N_bz=2)

(401.96860096690966, array([364.58660202, 426.3681481 ]))

In [64]:
# Bootstrap with lstsq=False and far fewer draws (1000 instead of 100000).
bootstrap_hl(hl_spl, Ndraws=1000, BFGS_loss_func=loss_func_l2, lstsq=False)

(396.5764154133678, array([376.24284431, 410.123602  ]))

In [27]:
# NOTE(review): bootstrap_hl2 is not among the names imported from
# src.curve_fitting in this notebook, so this cell presumably fails with a
# NameError on a fresh kernel. Import it or remove this cell.
bootstrap_hl2(hl_spl, Ndraws=1000, BFGS_loss_func=loss_func_l2, lstsq=False)

(396.5764154133678, array([385.61548928, 405.28895984]))

In [16]:
# Fast bootstrap variant with the L2 loss, for comparison with bootstrap_hl.
bootstrap_hl_fast(hl_spl, Ndraws=100000, BFGS_loss_func=loss_func_l2)

(401.96860096690966, array([363.83278506, 427.32375062]))

In [17]:
# Fast bootstrap variant with the loss function swapped to loss_func_c1.
bootstrap_hl_fast(hl_spl, Ndraws=100000, BFGS_loss_func=loss_func_c1)

(395.47721446302836, array([355.36040785, 421.64433102]))

In [62]:
# Same as the previous loss_func_c1 call but with minus_N_bz=0.
# NOTE(review): the meaning of minus_N_bz is not visible in this notebook.
# Confirm it in src.curve_fitting.
bootstrap_hl_fast(hl_spl, Ndraws=100000, BFGS_loss_func=loss_func_c1, minus_N_bz=0)

(395.47721446302836, array([369.87870146, 417.67771578]))