In [1]:
import sys
from collections import defaultdict
import multiprocessing
from multiprocessing import Pool
from functools import partial
import timeit

import ribopy
from ribopy import Ribo
from ribopy.core.get_gadgets import get_region_boundaries, get_reference_names

import numpy as np
import pandas as pd

sys.path.insert(0, '../snp')
from ref_lib.Fasta import FastaEntry, FastaFile

In [2]:
# Transcriptome FASTA files (.fa.gz); they are read with FastaFile in the cells below.
mouse_transcriptome_file = "../../mouse_itp_reference/transcriptome/varnt_masked_and_filtered_mouse_transcriptome.fa.gz"
human_transcriptome_file = "../../../itp/human_itp_reference/transcriptome/appris_human_v2_after_filtering.fa.gz"

# .ribo files; each one is handed to ribopy's Ribo in the cells below.
mouse_ribo_file = "../../mouse-itp_v5.ribo"
human_ribo_file = "../../../itp/human-itp_v4.ribo"

# Smallest and largest read length (both inclusive) for which coverage is collected.
LEN_MIN = 29
LEN_MAX = 35

In [3]:
# header -> sequence for every entry of the mouse transcriptome FASTA file
mouse_sequences = {}

with FastaFile(mouse_transcriptome_file) as mouse_transcriptome:
    mouse_sequences.update(
        (record.header, record.sequence) for record in mouse_transcriptome
    )



In [67]:
# header -> sequence for every entry of the human transcriptome FASTA file
human_sequences = {}

with FastaFile(human_transcriptome_file) as human_transcriptome:
    human_sequences.update(
        (record.header, record.sequence) for record in human_transcriptome
    )


In [4]:
# Ribo object of the mouse .ribo file; the experiment names, coverage and
# region boundaries used in later cells are all obtained through it.
mouse_ribo = Ribo(mouse_ribo_file)

In [64]:
# Ribo object of the human .ribo file; the experiment names, coverage and
# region boundaries used in later cells are all obtained through it.
human_ribo = Ribo(human_ribo_file)

In [5]:
mouse_region_boundaries = get_region_boundaries(mouse_ribo._handle)
mouse_gene_names        = get_reference_names(mouse_ribo._handle)

# Pair every reference name with its region boundaries.
# (This is a one-shot iterator; it is consumed by the comprehension below.)
mouse_ref_name_and_boundaries = zip(mouse_gene_names, mouse_region_boundaries)

# reference name -> second entry (index 1) of its region boundaries,
# which is used as the CDS boundaries in the rest of the notebook.
mouse_cds_boundaries = {
    ref_name: regions[1]
    for ref_name, regions in mouse_ref_name_and_boundaries
}

In [66]:
human_region_boundaries = get_region_boundaries(human_ribo._handle)
human_gene_names        = get_reference_names(human_ribo._handle)

# Pair every reference name with its region boundaries.
# (This is a one-shot iterator; it is consumed by the comprehension below.)
human_ref_name_and_boundaries = zip(human_gene_names, human_region_boundaries)

# reference name -> second entry (index 1) of its region boundaries,
# which is used as the CDS boundaries in the rest of the notebook.
human_cds_boundaries = {
    ref_name: regions[1]
    for ref_name, regions in human_ref_name_and_boundaries
}

In [6]:
# NOTE(review): `mouse_ribo` was already created from this same file in an
# earlier cell; this statement rebinds the name to a new Ribo object.
mouse_ribo = Ribo(mouse_ribo_file)

In [7]:
def get_coverage_at_length(this_ribo, experiment, rpf_length):
    """
    Return the coverage of `experiment` restricted to one read length.

    Both ends of the length range given to `this_ribo.get_coverage`
    are set to `rpf_length`.
    """
    single_length_range = dict(range_lower = rpf_length,
                               range_upper = rpf_length)

    return this_ribo.get_coverage(experiment = experiment,
                                  **single_length_range)

def make_coverage_dict_of_experiment(this_ribo, experiment, len_min, len_max):
    """
    Collect the per-length coverage of one experiment.

    Returns a defaultdict keyed by read length, with one entry for every
    length from `len_min` through `len_max` (both inclusive). Each value is
    the result of `get_coverage_at_length` for that length.
    """
    coverage_by_length = defaultdict(dict)

    coverage_by_length.update(
        (rpf_length, get_coverage_at_length(this_ribo, experiment, rpf_length))
        for rpf_length in range(len_min, len_max + 1)
    )

    return coverage_by_length

In [8]:
# Per-length coverage of a single experiment.
MII_1_coverages = make_coverage_dict_of_experiment(
    mouse_ribo,
    experiment = "20210301-ITP-MII-25-B",
    len_min    = LEN_MIN,
    len_max    = LEN_MAX,
)

In [9]:
# The keys are the read lengths that were collected.
list(MII_1_coverages)

[29, 30, 31, 32, 33, 34, 35]

In [10]:
def init_frame_triplet():
    """
    Return a fresh counter [frame 0, frame 1, frame 2], all zeros.

    It is the default factory of the table built by `count_nucleotides`.
    It is a named module-level function (not a lambda) so that the
    resulting defaultdict can be pickled.
    """
    return [0, 0, 0]

def count_nucleotides(coverage_dict, 
                      cds_annotation_dict, 
                      sequence_dict, 
                      rpf_len,
                      left_span = 2, right_span = 1 ):
    """
    Given a footprint length, this function creates a table where each row
    comes from a nucleotide sequence
    and the three columns correspond to the frames 0,1,2
    
    coverage_dict:
       gene_identifier -> coverage vector
       
    cds_annotation_dict:
       gene_identifier -> [cds_start, cds_end]
       
    sequence_dict:
       gene_identifier -> gene_sequence
       
    rpf_len:
       read length, ribosome protected footprint length
       
    left_span:  number of nucleotides taken right before
                `position + rpf_len`, i.e. the last nucleotides
                before the 3' end of the read
    
    right_span: number of nucleotides taken starting at
                `position + rpf_len`, i.e. the first nucleotides
                after the 3' end of the read
                
    (The 3' end interpretation assumes that coverage[position] counts the
     reads starting at `position` -- TODO confirm against the coverage source.)
    
    Only positions inside [cds_start, cds_end) with non-zero coverage are
    used. A position is skipped when its window contains "N" or when the
    window is not exactly left_span + right_span nucleotides long
    (it would start before the sequence or run past its end).
    
    Returns a defaultdict:
       nucleotide window -> [count in frame 0, frame 1, frame 2]
    where the frame is (position - cds_start) % 3.
    """
    
    window_length = left_span + right_span
    nuc_counts    = defaultdict(init_frame_triplet)
    
    for gene, coverage in coverage_dict.items():
        cds_start = cds_annotation_dict[gene][0]
        cds_end   = cds_annotation_dict[gene][1]
        
        for position in range( cds_start, cds_end ):
            read_count = coverage[position]
            
            if read_count == 0:
                continue
            
            window_start = position + rpf_len - left_span
            
            # A negative start would silently wrap around to the end
            # of the sequence when slicing.
            if window_start < 0:
                continue
                
            this_sequence = sequence_dict[gene][window_start: \
                                                position + rpf_len + right_span]
            
            # Slicing past the end of the sequence yields a shorter string;
            # counting it would mix windows of different lengths in one table.
            if len(this_sequence) != window_length:
                continue
            
            if "N" in this_sequence:
                continue
            
            this_frame = (position - cds_start) % 3
            
            nuc_counts[this_sequence][this_frame] += read_count
        
    return nuc_counts

In [11]:
def count_nucleotides_parallel( coverage_dict, 
                                cds_annotation_dict, 
                                sequence_dict, 
                                rpf_min, 
                                rpf_max,
                                processes = 4,
                                left_span = 2, right_span = 1 ):
    
    """
    Wrapper for the function `count_nucleotides`.
    
    
    It takes a range of ribosome protected footprint lengths and runs the
    `count_nucleotides` function on each of these individual lengths,
    using a pool of worker processes.
    
    coverage_dict:
       read_length -> coverage dictionary of that length; the entry of each
       length is handed to `count_nucleotides` as its `coverage_dict`
       
    cds_annotation_dict, sequence_dict, left_span, right_span:
       passed to `count_nucleotides` unchanged
       
    rpf_min, rpf_max:
       smallest and largest read length to process (both inclusive)
       
    processes:
       number of worker processes given to `Pool`
       
    Returns a list holding one `count_nucleotides` result per read length,
    ordered from rpf_min to rpf_max.
    
    Side effect: every call rebinds the module-level name `f`.
    """
    
    rpf_range = list(range(rpf_min, rpf_max + 1))
    
    # Without this global, we have an error at the Pool.map step.
    # Pool.map has to pickle the callable it is given, and a function nested
    # in this call cannot be pickled; declaring `f` global publishes it as a
    # module-level name instead.
    # NOTE(review): this presumably relies on the worker processes being
    # forked after `f` is defined, so that they inherit `f` together with the
    # arguments it captures -- confirm the multiprocessing start method.
    global f
    
    def f(x):
       return count_nucleotides( coverage_dict[x], cds_annotation_dict, sequence_dict, rpf_len = x,
                      left_span = left_span, right_span = right_span ) 
    
    with Pool(processes) as p:
        mapped_list = p.map(f, rpf_range)
        
    return(mapped_list)
        


In [12]:
def _adjust_individual_frames(frame_dict):
    """
    Picks the maximum from each frame and moves the reads to the maximally picked frame.
    """
    
    result = dict()
    
    for this_seq, triplet in frame_dict.items():
        
        max_arg = np.argmax(triplet)
        result[this_seq] = [0, 0, 0]
        result[this_seq][max_arg] = np.sum(triplet)
        
    return result



def adjust_frames(list_of_frame_dict, processes = 4):
    """
    Apply `_adjust_individual_frames` to every dictionary of
    `list_of_frame_dict`, using a pool of `processes` worker processes.

    Returns the adjusted dictionaries as a list, in input order.
    """
    with Pool(processes) as worker_pool:
        return worker_pool.map(_adjust_individual_frames, list_of_frame_dict)


In [13]:
def _p_site_adjust(df):
    """
    Bring maximal counts in the frames to the frame 0 for each length.
    This is done via a cyclic shift so that the maximal is at frame 0.
    """
    adjusted_tuples = []

    for r,v in df.iterrows():
        max_index      = np.argmax(v)
        adjusted_tuple = [v[ (i + max_index) % 3] for i in range(3)  ]
        adjusted_tuples.append(adjusted_tuple)

    mydf = pd.DataFrame(adjusted_tuples)
    
    return mydf
    


def adjust_p_sites(nucleotide_counts_list):
    """
    Sum the frame counts of every table in `nucleotide_counts_list` and
    shift each resulting row so that its largest value is at frame 0.

    Each element of the list is expected to map a sequence to its counts
    in the frames 0, 1, 2. The result of `_p_site_adjust` is returned:
    one row per element of the list.
    """
    # For each table: the counts of all sequences added up, frame by frame.
    frame_totals = [
        pd.DataFrame(counts_table).sum(axis=1)
        for counts_table in nucleotide_counts_list
    ]

    return _p_site_adjust(pd.DataFrame(frame_totals))



def calculate_periodicity_percentage(df):
    """
    Return the share of each column of `df` in the total of all columns,
    expressed as a percentage.
    """
    column_totals = df.sum()
    grand_total   = np.sum(column_totals)

    return 100 * ( column_totals / grand_total )





In [14]:
experiments = mouse_ribo.experiments

nucleotide_counts_2_1 = dict()
nucleotide_counts_0_1 = dict()
nucleotide_counts_0_2 = dict()

# (left_span, right_span) -> dictionary that receives the counts of that window.
# The windows are processed in the order listed here.
counts_by_span = { (2, 1): nucleotide_counts_2_1,
                   (0, 1): nucleotide_counts_0_1,
                   (0, 2): nucleotide_counts_0_2 }

for e in experiments:
    print(e)
    this_coverage = make_coverage_dict_of_experiment(
                            mouse_ribo, 
                            experiment = e, 
                            len_min    = LEN_MIN, 
                            len_max    = LEN_MAX )
    
    for (left_span, right_span), counts_of_span in counts_by_span.items():
        counts_of_span[e] = count_nucleotides_parallel( 
                                    coverage_dict       = this_coverage, 
                                    cds_annotation_dict = mouse_cds_boundaries ,
                                    sequence_dict       = mouse_sequences, 
                                    rpf_min = LEN_MIN, rpf_max = LEN_MAX,
                                    left_span           = left_span, 
                                    right_span          = right_span )

20210301-ITP-MII-25-B
20210301-ITP-MII-50-A
20210301-ITP-MII-50-B
20210318-ITP-MII-50-B
20210513-ITP-1cell-cross-50-A
20210513-ITP-1cell-cross-50-B
20210513-ITP-1cell-cross-50-C
20210513-ITP-1cell-cross-50-D
20210513-ITP-1cell-cross-50-E
20210513-ITP-2cell-cross-50-B
20210513-ITP-2cell-cross-50-C
20210513-ITP-2cell-cross-50-F
20210513-ITP-4cell-cross-50-B
20210513-ITP-4cell-cross-50-C
20210513-ITP-4cell-cross-50-D
20210513-ITP-8cell-cross-50-A
20210513-ITP-8cell-cross-50-B
20210513-ITP-8cell-cross-50-C
20210513-ITP-8cell-cross-50-D
20210614-ITP-GV-50-A
20210614-ITP-GV-50-B
20210614-ITP-GV-50-C
20210614-ITP-GV-50-E
20210614-ITP-GV-50-F
20210614-ITP-MII-50-D


In [42]:
# Single-experiment check on the (left_span = 0, right_span = 1) counts.
# experiments[22] is "20210614-ITP-GV-50-E" according to the experiment names
# printed by the counting loop.
one_nuc_adjusted_frames = adjust_frames(nucleotide_counts_0_1[mouse_ribo.experiments[22]]  )
one_nuc_y               = adjust_p_sites(one_nuc_adjusted_frames)
calculate_periodicity_percentage(one_nuc_y)

0    66.805899
1    16.141993
2    17.052108
dtype: float64

In [56]:
mouse_adjusted_frame_percentages = dict()
mouse_raw_frame_percentages      = dict()

for e in mouse_ribo.experiments:
    counts_per_length = nucleotide_counts_0_1[e]

    # Raw: the counts as they were collected.
    mouse_raw_frame_percentages[e] = calculate_periodicity_percentage(
        adjust_p_sites(counts_per_length)
    )

    # Adjusted: the reads of every sequence are first moved to its best frame.
    mouse_adjusted_frame_percentages[e] = calculate_periodicity_percentage(
        adjust_p_sites(adjust_frames(counts_per_length))
    )

In [61]:
# One row per experiment, one column per frame.
mouse_adjusted_frame_percentages_df = pd.DataFrame(mouse_adjusted_frame_percentages).T
mouse_raw_frame_percentages_df      = pd.DataFrame(mouse_raw_frame_percentages).T

In [62]:
# Display the adjusted frame percentages (rows: experiments, columns: frames).
mouse_adjusted_frame_percentages_df

Unnamed: 0,0,1,2
20210301-ITP-MII-25-B,62.171025,18.16353,19.665445
20210301-ITP-MII-50-A,65.422901,14.426416,20.150683
20210301-ITP-MII-50-B,68.801977,14.477091,16.720932
20210318-ITP-MII-50-B,61.859729,17.256521,20.88375
20210513-ITP-1cell-cross-50-A,67.158603,8.635235,24.206162
20210513-ITP-1cell-cross-50-B,62.973949,14.166165,22.859886
20210513-ITP-1cell-cross-50-C,68.048846,9.605475,22.345679
20210513-ITP-1cell-cross-50-D,62.46488,16.427329,21.107791
20210513-ITP-1cell-cross-50-E,61.230001,15.0735,23.6965
20210513-ITP-2cell-cross-50-B,72.482944,8.926726,18.59033


In [72]:
# Write the adjusted mouse percentages to a CSV file (path is relative to the working directory).
mouse_adjusted_frame_percentages_df.to_csv("mouse_adjusted_frame_percentages.csv")

In [63]:
# Display the raw frame percentages (rows: experiments, columns: frames).
mouse_raw_frame_percentages_df

Unnamed: 0,0,1,2
20210301-ITP-MII-25-B,37.501033,32.366963,30.132003
20210301-ITP-MII-50-A,37.062525,32.116927,30.820548
20210301-ITP-MII-50-B,36.976306,30.456614,32.56708
20210318-ITP-MII-50-B,36.382408,31.976048,31.641544
20210513-ITP-1cell-cross-50-A,37.201092,30.21819,32.580718
20210513-ITP-1cell-cross-50-B,36.296678,30.961797,32.741524
20210513-ITP-1cell-cross-50-C,37.155126,32.555019,30.289855
20210513-ITP-1cell-cross-50-D,36.710074,32.112117,31.177809
20210513-ITP-1cell-cross-50-E,36.871599,31.515471,31.612929
20210513-ITP-2cell-cross-50-B,37.670929,30.402552,31.926519


In [71]:
# Write the raw mouse percentages to a CSV file (path is relative to the working directory).
mouse_raw_frame_percentages_df.to_csv("mouse_raw_frame_percentages.csv")

--------------------------------------------------------

## HUMAN DATA

In [81]:
# Names of the experiments available in the human Ribo object.
human_ribo.experiments

('20191203-Kit-10M-Monosome-1',
 '20191203-Kit-10M-Monosome-2',
 '20191203-Kit-10M-Monosome-3',
 '20201104-ITP-100-5mM-50-1',
 '20201209-ITP-100-5mM-6',
 '20210131-ITP-100-5mM-50_diluted-1')

In [73]:
human_experiments = human_ribo.experiments

human_nucleotide_counts_0_1 = dict()

# Arguments shared by all experiments: window of one nucleotide
# (left_span = 0, right_span = 1).
human_count_options = dict( cds_annotation_dict = human_cds_boundaries,
                            sequence_dict       = human_sequences,
                            rpf_min             = LEN_MIN,
                            rpf_max             = LEN_MAX,
                            left_span           = 0,
                            right_span          = 1 )

for e in human_experiments:
    print(e)
    this_coverage = make_coverage_dict_of_experiment(
                            human_ribo, e, LEN_MIN, LEN_MAX )

    human_nucleotide_counts_0_1[e] = count_nucleotides_parallel(
                                        coverage_dict = this_coverage,
                                        **human_count_options )



20191203-Kit-10M-Monosome-1
20191203-Kit-10M-Monosome-2
20191203-Kit-10M-Monosome-3
20201104-ITP-100-5mM-50-1
20201209-ITP-100-5mM-6
20210131-ITP-100-5mM-50_diluted-1


In [74]:
human_adjusted_frame_percentages = dict()
human_raw_frame_percentages      = dict()

for e in human_ribo.experiments:
    counts_per_length = human_nucleotide_counts_0_1[e]

    # Raw: the counts as they were collected.
    human_raw_frame_percentages[e] = calculate_periodicity_percentage(
        adjust_p_sites(counts_per_length)
    )

    # Adjusted: the reads of every sequence are first moved to its best frame.
    human_adjusted_frame_percentages[e] = calculate_periodicity_percentage(
        adjust_p_sites(adjust_frames(counts_per_length))
    )

In [79]:
# One row per experiment, one column per frame; saved to CSV and displayed.
human_raw_frame_percentages_df = pd.DataFrame(human_raw_frame_percentages).T
human_raw_frame_percentages_df.to_csv("human_raw_frame_percentages.csv")
human_raw_frame_percentages_df

Unnamed: 0,0,1,2
20191203-Kit-10M-Monosome-1,39.006724,31.846913,29.146363
20191203-Kit-10M-Monosome-2,38.693376,32.024315,29.28231
20191203-Kit-10M-Monosome-3,38.623567,32.480144,28.896289
20201104-ITP-100-5mM-50-1,38.641237,32.225242,29.133521
20201209-ITP-100-5mM-6,37.139046,31.723705,31.137249
20210131-ITP-100-5mM-50_diluted-1,37.707792,31.300839,30.991369


In [80]:
# One row per experiment, one column per frame; saved to CSV and displayed.
human_adjusted_frame_percentages_df = pd.DataFrame(human_adjusted_frame_percentages).T
human_adjusted_frame_percentages_df.to_csv("human_adjusted_frame_percentages.csv")
human_adjusted_frame_percentages_df

Unnamed: 0,0,1,2
20191203-Kit-10M-Monosome-1,72.230854,11.828664,15.940481
20191203-Kit-10M-Monosome-2,72.89124,11.484843,15.623917
20191203-Kit-10M-Monosome-3,72.742513,11.697062,15.560425
20201104-ITP-100-5mM-50-1,70.082638,9.634706,20.282655
20201209-ITP-100-5mM-6,61.387099,19.283025,19.329876
20210131-ITP-100-5mM-50_diluted-1,67.161393,11.709822,21.128785
