# EXTRACT SNP COUNTS OF DETECTED GENES


We extract the gene lists that are snp-expressed in 4 and 8 cell stages.

The definition of expressed (or detected) genes comes from our prop-test.
See the definition of `perform_prop_test` for details.

In [4]:
import gzip
import pandas as pd
import numpy as np
import scipy as sp
import os

from multiprocessing import Pool
from matplotlib import pyplot as plt
from functools import reduce

import pickle

# We use rpy2 for the prop.test and padjust in R
import rpy2
from rpy2 import robjects
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr


from IPython.display import display, HTML

In [5]:
# Load the rpy2 IPython extension.
%load_ext rpy2.ipython
# Render matplotlib figures inline in the notebook.
%matplotlib inline

# Default figure size, given as [width, height].
plt.rcParams['figure.figsize'] = [16, 8]

## Constants

In [6]:
# Passed to perform_prop_test as `error_threshold`.
# A row is flagged diff_is_sufficient = 1 only when the prop.test confidence
# interval stays at least this far away from zero; otherwise it gets 0.
PROPTEST_ERROR_THRESHOLD = 0.05

# After adjusting the p-values of the prop-test
# we pick the FDR values less than this threshold.
# NOTE(review): not referenced in the part of the notebook visible here.
FDR_THRESHOLD = 0.2

## Input & Output Folders

In [7]:
# Root folder of the SNP count tables; the sub-folders "snp_counts_rnaseq"
# and "snp_counts_riboseq" are read from it.
INPUT_FOLDER  = "../snp"
# Folder used for the CSV outputs whose paths are built from OUTPUT_FOLDER.
OUTPUT_FOLDER = "."


In [8]:
# RNA-Seq experiments, grouped by stage.
# Replicate 20210607-RNAseq-4cell-cross-B is deliberately excluded.

rnaseq_one_cell_exps = [
    "20210607-RNAseq-1cell-cross-A",
    "20210607-RNAseq-1cell-cross-B",
    "20210607-RNAseq-1cell-cross-C",
    "20210607-RNAseq-1cell-cross-D",
]

rnaseq_two_cell_exps = [
    "20210607-RNAseq-2cell-cross-A",
    "20210607-RNAseq-2cell-cross-B",
    "20210607-RNAseq-2cell-cross-C",
    "20210607-RNAseq-2cell-cross-D",
]

# Excluded: "20210607-RNAseq-4cell-cross-B"
rnaseq_four_cell_exps = [
    "20210607-RNAseq-4cell-cross-A",
    "20210607-RNAseq-4cell-cross-C",
]

rnaseq_eight_cell_exps = [
    "20210607-RNAseq-8cell-cross-A",
    "20210607-RNAseq-8cell-cross-B",
    "20210607-RNAseq-8cell-cross-C",
    "20210607-RNAseq-8cell-cross-D",
]

rnaseq_gv_epxs = [
    "20210607-RNAseq-GV-A",
    "20210607-RNAseq-GV-B",
    "20210607-RNAseq-GV-C",
    "20210607-RNAseq-GV-D",
]

rnaseq_mii_exps = [
    "20210607-RNAseq-MII-A",
    "20210607-RNAseq-MII-B",
    "20210607-RNAseq-MII-C",
    "20210607-RNAseq-MII-D",
]

# Every RNA-Seq experiment, in the order: 1-cell, 2-cell, 4-cell, 8-cell, GV, MII.
rnaseq_all_exps = (
    rnaseq_one_cell_exps
    + rnaseq_two_cell_exps
    + rnaseq_four_cell_exps
    + rnaseq_eight_cell_exps
    + rnaseq_gv_epxs
    + rnaseq_mii_exps
)

In [9]:
## Ribosome profiling experiments, grouped by stage.

riboseq_one_cell_exps = [
    "20210513-ITP-1cell-cross-50-A",
    "20210513-ITP-1cell-cross-50-B",
    "20210513-ITP-1cell-cross-50-C",
    "20210513-ITP-1cell-cross-50-D",
    "20210513-ITP-1cell-cross-50-E",
]

riboseq_two_cell_exps = [
    "20210513-ITP-2cell-cross-50-B",
    "20210513-ITP-2cell-cross-50-C",
    "20210513-ITP-2cell-cross-50-F",
]

riboseq_four_cell_exps = [
    "20210513-ITP-4cell-cross-50-B",
    "20210513-ITP-4cell-cross-50-C",
    "20210513-ITP-4cell-cross-50-D",
]

riboseq_eigth_cell_exps = [
    "20210513-ITP-8cell-cross-50-A",
    "20210513-ITP-8cell-cross-50-B",
    "20210513-ITP-8cell-cross-50-C",
    "20210513-ITP-8cell-cross-50-D",
]

# Left out of the MII set: 20210614-ITP-MII-50-A, -B, -C, -E and -F.
riboseq_mii_exps = [
    "20210301-ITP-MII-25-B",
    "20210301-ITP-MII-50-A",
    "20210301-ITP-MII-50-B",
    "20210318-ITP-MII-50-B",
    "20210614-ITP-MII-50-D",
]

riboseq_gv_epxs = [
    "20210614-ITP-GV-50-A",
    "20210614-ITP-GV-50-B",
    "20210614-ITP-GV-50-C",
    "20210614-ITP-GV-50-D",
    "20210614-ITP-GV-50-E",
    "20210614-ITP-GV-50-F",
]

# Every ribosome profiling experiment, in the order:
# 1-cell, 2-cell, 4-cell, 8-cell, MII, GV.
riboseq_all_exps = (
    riboseq_one_cell_exps
    + riboseq_two_cell_exps
    + riboseq_four_cell_exps
    + riboseq_eigth_cell_exps
    + riboseq_mii_exps
    + riboseq_gv_epxs
)

In [10]:
## FUNCTIONS TO READ SNP FILES

In [11]:
def extract_CDS(name_str):
    """
    Returns CDS boundaries from the transcript header

    The header is split on "|" and the first field starting with "CDS"
    is parsed as ``CDS:<start>-<stop>``.

    Parameters
    ----------
    name_str : str
        Transcript header made of "|"-separated fields.

    Returns
    -------
    tuple of (int, int)
        The (start, stop) values of the CDS field.

    Raises
    ------
    ValueError
        If no field of the header starts with "CDS". The offending header
        is included in the message.
    """

    for field in name_str.split("|"):
        if field.startswith("CDS"):
            boundaries = field.split(":")[1].split("-")
            return (int(boundaries[0]), int(boundaries[1]))

    # Carry the header in the exception itself instead of printing it, so
    # the diagnostic travels with the error wherever it is reported.
    raise ValueError("No CDS field found in transcript header: {!r}".format(name_str))
    
    
    
############################################################
    
    
    
def get_df(snp_count_file):
    """
    Reads a SNP count file into a dataframe

    The file is read as a tab-separated table. The "transcript" column is
    replaced by the fifth "|"-separated field of the original header, and
    three columns are added: "snp_total" (A + C + G + T), "CDS_start" and
    "CDS_stop" (parsed from the header by extract_CDS). Rows are returned
    sorted by "snp_total" in descending order.
    """

    table   = pd.read_csv(snp_count_file, sep = "\t")
    headers = table["transcript"]

    # Both lists are derived from the full headers before the column
    # is overwritten with the short transcript names.
    short_names = [ header.split("|")[4] for header in headers ]
    cds_ranges  = [ extract_CDS(header) for header in headers ]

    table["transcript"] = short_names
    table["snp_total"]  = table["A"] + table["C"] + table["G"] + table["T"]
    table["CDS_start"]  = [ cds_start for cds_start, _ in cds_ranges ]
    table["CDS_stop"]   = [ cds_stop for _, cds_stop in cds_ranges ]

    table.sort_values(by = ['snp_total'], ascending = False, inplace = True)

    return table

#################################################################

def get_ref_alt_counts(count_df):
    """
    Sums the reference (maternal) and alternative (paternal) read counts
    over all rows of a SNP dataframe.

    For each row, the REF / ALT columns name the nucleotide column whose
    count is used. The "cds_" totals only include rows whose position lies
    within [CDS_start, CDS_stop] (both ends inclusive).

    Returns a dictionary with the keys
    ref_count, alt_count, total_count,
    cds_ref_count, cds_alt_count, cds_total_count.
    """

    totals = {"ref_count":     0, "alt_count":     0, "total_count":     0,
              "cds_ref_count": 0, "cds_alt_count": 0, "cds_total_count": 0}

    for _, snp in count_df.iterrows():
        depth     = snp["A"] + snp["C"] + snp["G"] + snp["T"]
        ref_reads = snp[ snp["REF"] ]
        alt_reads = snp[ snp["ALT"] ]

        totals["total_count"] += depth
        totals["ref_count"]   += ref_reads
        totals["alt_count"]   += alt_reads

        if snp["CDS_start"] <= snp["position"] <= snp["CDS_stop"]:
            totals["cds_total_count"] += depth
            totals["cds_ref_count"]   += ref_reads
            totals["cds_alt_count"]   += alt_reads

    return totals

## RNA-Seq SNP Calls

We read SNP calls from RNA-Seq into dataframes.

In [12]:
# Load one SNP count table per RNA-Seq experiment and tag its rows
# with the experiment name.
rnaseq_experiment_dfs = {}

for e in rnaseq_all_exps:
    this_file     = os.path.join(INPUT_FOLDER, "snp_counts_rnaseq", e + ".tsv.gz")
    experiment_df = get_df(this_file)
    experiment_df["experiment"] = e
    rnaseq_experiment_dfs[e]    = experiment_df

# Compute the per-experiment REF / ALT totals in 4 worker processes.
with Pool(4) as p:
    rnaseq_counts_array = p.map(
        get_ref_alt_counts,
        [ rnaseq_experiment_dfs[e] for e in rnaseq_all_exps ],
    )

# Pool.map keeps the input order, so results pair up with experiment names.
rnaseq_reference_alt_counts = dict(zip(rnaseq_all_exps, rnaseq_counts_array))
    


In [13]:
# One row per RNA-Seq experiment, one column per count type.
rnseq_snp_count_file = os.path.join(OUTPUT_FOLDER, "rnaseq_experimentwise_snp_counts.csv")
rnaseq_snp_count_df  = pd.DataFrame.from_dict(rnaseq_reference_alt_counts).T
rnaseq_snp_count_df.to_csv(rnseq_snp_count_file)

# Preview of the first two experiments.
rnaseq_snp_count_df.head(2)

Unnamed: 0,ref_count,alt_count,total_count,cds_ref_count,cds_alt_count,cds_total_count
20210607-RNAseq-1cell-cross-A,179914,314,180540,179914,314,180540
20210607-RNAseq-1cell-cross-B,180976,373,181700,180976,373,181700


In [14]:
# Preview: the first five rows of one 4-cell RNA-Seq replicate.
rnaseq_experiment_dfs["20210607-RNAseq-4cell-cross-C"][:5]

Unnamed: 0,transcript,position,REF,ALT,A,C,G,T,snp_total,CDS_start,CDS_stop,experiment
68514,Rps19-203,303,C,T,1,521,1,616,1139,298,735,20210607-RNAseq-4cell-cross-C
66330,Dppa3-201,290,A,C,691,403,1,0,1095,114,566,20210607-RNAseq-4cell-cross-C
4478,Rps26-201,299,C,T,0,505,0,517,1022,270,617,20210607-RNAseq-4cell-cross-C
66329,Dppa3-201,263,C,T,0,509,0,437,946,114,566,20210607-RNAseq-4cell-cross-C
23498,Rpl15-202,275,A,G,441,0,474,0,915,120,734,20210607-RNAseq-4cell-cross-C


## Ribosome Profiling SNP Calls

In [15]:
# Load one SNP count table per ribosome profiling experiment and tag its
# rows with the experiment name.
riboseq_experiment_dfs = {}

for e in riboseq_all_exps:
    this_file     = os.path.join(INPUT_FOLDER, "snp_counts_riboseq", e + ".tsv.gz")
    experiment_df = get_df(this_file)
    experiment_df["experiment"] = e
    riboseq_experiment_dfs[e]   = experiment_df

# Compute the per-experiment REF / ALT totals in 4 worker processes.
with Pool(4) as p:
    riboseq_counts_array = p.map(
        get_ref_alt_counts,
        [ riboseq_experiment_dfs[e] for e in riboseq_all_exps ],
    )

# Pool.map keeps the input order, so results pair up with experiment names.
riboseq_reference_alt_counts = dict(zip(riboseq_all_exps, riboseq_counts_array))



In [16]:
#for e in riboseq_all_exps:
#    print(e)
#    print("Alt: {:.1f}    Ref: {:.1f}".format( riboseq_reference_alt_percentages[e]["cds_alt_percentage"],
# 

## SNP Data at Highest Detail

We combine all of the SNP data into one dataframe. 
For each experiment, transcript and SNP position there is a separate row with SNP information.
Note that, in our context, reference (REF) corresponds to maternal strain and alternative (ALT) corresponds to paternal strain.

In [17]:
def find_maternal_paternal_snps(this_df):
    """
    Adds "maternal" and "paternal" count columns to a SNP dataframe.

    For every row, the maternal count is the value in the nucleotide column
    named by REF and the paternal count is the value in the column named
    by ALT. The dataframe is modified in place and also returned.
    """
    allele_counts = [ (snp[ snp["REF"] ], snp[ snp["ALT"] ])
                      for _, snp in this_df.iterrows() ]

    this_df["maternal"] = [ ref_reads for ref_reads, _ in allele_counts ]
    this_df["paternal"] = [ alt_reads for _, alt_reads in allele_counts ]

    return this_df



In [18]:
# Find ribosome profiling maternal / paternal counts.
# One dataframe per experiment is processed across 4 worker processes; the
# returned list follows the order of riboseq_experiment_dfs.values().
# NOTE(review): the frames kept in riboseq_experiment_dfs presumably do not
# gain the new columns, since arguments are pickled to the workers - confirm.

with Pool(4) as p:
    riboseq_detailed_count_dfs = p.map( find_maternal_paternal_snps, list(riboseq_experiment_dfs.values()) )

In [19]:
# Find RNA-Seq maternal / paternal counts.
# One dataframe per experiment is processed across 4 worker processes; the
# returned list follows the order of rnaseq_experiment_dfs.values().
# NOTE(review): the frames kept in rnaseq_experiment_dfs presumably do not
# gain the new columns, since arguments are pickled to the workers - confirm.

with Pool(4) as p:
    rnaseq_detailed_count_dfs = p.map( find_maternal_paternal_snps, list(rnaseq_experiment_dfs.values()) )

In [20]:
def combine_snp_dfs_into_on_df(df_list):
    """
    Stacks per-experiment SNP dataframes into a single dataframe.

    The frames are concatenated row-wise in the given order, each keeping
    its own row index. Only the columns experiment, transcript, position,
    paternal, maternal, REF, ALT, A, C, G and T are kept, in that order.
    The input dataframes are not modified.

    Raises ValueError (from pandas.concat) if df_list is empty and
    KeyError if one of the required columns is missing.
    """

    res_columns = ["experiment", "transcript", "position", "paternal",
                   "maternal", "REF", "ALT", "A", "C", "G", "T"]

    # A single concat replaces the repeated DataFrame.append calls:
    # append was removed in pandas 2.0 and re-copied the accumulated
    # frame on every iteration.
    return pd.concat(df_list)[res_columns]

In [21]:
# Stack the per-experiment frames (with maternal / paternal columns) into
# one dataframe per assay.
combined_riboseq_snp_df = combine_snp_dfs_into_on_df(riboseq_detailed_count_dfs)
combined_rnaseq_snp_df  = combine_snp_dfs_into_on_df(rnaseq_detailed_count_dfs)

In [22]:
# Write the detailed (per experiment, transcript and position) SNP tables
# to OUTPUT_FOLDER, without the row index.
high_res_riboseq_snp_file = os.path.join(OUTPUT_FOLDER, "riboseq_detailed_snps.csv.gz")
combined_riboseq_snp_df.to_csv( high_res_riboseq_snp_file, index = False )

high_res_rnaseq_snp_file = os.path.join(OUTPUT_FOLDER, "rnaseq_detailed_snps.csv.gz")
combined_rnaseq_snp_df.to_csv( high_res_rnaseq_snp_file, index = False )

In [23]:
# Display the combined ribosome profiling SNP table.
combined_riboseq_snp_df

Unnamed: 0,experiment,transcript,position,paternal,maternal,REF,ALT,A,C,G,T
21047,20210513-ITP-1cell-cross-50-A,Spin1-201,848,0,139,C,G,0,139,0,0
22356,20210513-ITP-1cell-cross-50-A,Zbed3-202,471,0,124,T,C,0,0,0,124
22358,20210513-ITP-1cell-cross-50-A,Zbed3-202,707,0,100,A,G,100,0,0,0
23711,20210513-ITP-1cell-cross-50-A,E330034G19Rik-203,257,0,74,A,C,74,0,0,0
22354,20210513-ITP-1cell-cross-50-A,Zbed3-202,320,4,52,T,G,4,0,4,52
...,...,...,...,...,...,...,...,...,...,...,...
29124,20210614-ITP-GV-50-F,Slc4a8-201,3090,0,0,A,G,0,0,0,0
29122,20210614-ITP-GV-50-F,Slc4a8-201,2048,0,0,T,A,0,0,0,0
29121,20210614-ITP-GV-50-F,Slc4a8-201,1791,0,0,G,A,0,0,0,0
29120,20210614-ITP-GV-50-F,Slc4a8-201,1494,0,0,G,A,0,0,0,0


## Aggregating SNP Counts

We sum SNP counts across replicates.

In [24]:
def add_two_snp_dfs(df_1, df_2):
    """
    Helper that sums the SNP counts of two dataframes.

    Rows of the two inputs are paired on the key "<transcript>_<position>"
    (index name "trans_pos"); only keys present in both inputs are kept.
    The A, C, G, T and snp_total columns are added together, while
    transcript, position, REF, ALT, CDS_start and CDS_stop are taken from
    df_1. The result has the same column names as the other SNP
    dataframes, and the inputs are left untouched.
    """

    def keyed_by_site(snp_df):
        # Copy of snp_df indexed by "<transcript>_<position>".
        site_keys = [ f"{name}_{pos}"
                      for name, pos in zip(snp_df["transcript"], snp_df["position"]) ]
        keyed = snp_df.copy()
        keyed["trans_pos"] = site_keys
        return keyed.set_index("trans_pos")

    paired = keyed_by_site(df_1).merge( keyed_by_site(df_2), on = ["trans_pos"] )

    # Columns are inserted in the order expected by the other SNP dataframes.
    summed = {}

    for column in ("transcript", "position", "REF", "ALT"):
        summed[column] = paired[column + "_x"]

    for column in ("A", "C", "G", "T", "snp_total"):
        summed[column] = np.add(paired[column + "_x"], paired[column + "_y"])

    for column in ("CDS_start", "CDS_stop"):
        summed[column] = paired[column + "_x"]

    return pd.DataFrame.from_dict(summed)


##############################################################################


def add_snp_dfs(df_list):
    """
    THIS IS THE MAIN FUNCTION TO CALL!!!
    Adds a list of dataframes containing SNPS

    Folds add_two_snp_dfs over df_list from left to right and returns the
    result sorted by "snp_total" in descending order.

    Note: for a one-element list, reduce returns that element itself, so
    the caller's dataframe is sorted in place and returned without going
    through add_two_snp_dfs. An empty list makes reduce raise TypeError.
    """
    result_df =  reduce( lambda x, y : add_two_snp_dfs(x,y), df_list  )
    result_df.sort_values(by = ['snp_total'], ascending=False, inplace=True)
    return result_df

In [25]:
# Example: sum the first two 1-cell ribosome profiling replicates.
add_two_snp_dfs( riboseq_experiment_dfs[riboseq_one_cell_exps[0]], riboseq_experiment_dfs[riboseq_one_cell_exps[1]] )

Unnamed: 0_level_0,transcript,position,REF,ALT,A,C,G,T,snp_total,CDS_start,CDS_stop
trans_pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Spin1-201_848,Spin1-201,848,C,G,0,320,2,1,323,261,1049
Zbed3-202_471,Zbed3-202,471,T,C,0,0,0,217,217,174,860
Zbed3-202_707,Zbed3-202,707,A,G,218,0,0,0,218,174,860
E330034G19Rik-203_257,E330034G19Rik-203,257,A,C,125,0,0,0,125,56,1111
Zbed3-202_320,Zbed3-202,320,T,G,8,2,10,130,150,174,860
...,...,...,...,...,...,...,...,...,...,...,...
Gpd1-201_825,Gpd1-201,825,T,C,0,0,0,0,0,94,1143
Gpd1-201_508,Gpd1-201,508,C,T,0,0,0,0,0,94,1143
Gpd1-201_462,Gpd1-201,462,C,T,0,0,0,0,0,94,1143
Smarcd1-201_1699,Smarcd1-201,1699,G,A,0,0,0,0,0,194,1741


In [26]:
# Ribosome profiling SNP counts summed over the replicates of each stage.
riboseq_snp_total_one_cell   = add_snp_dfs( [riboseq_experiment_dfs[e] for e in  riboseq_one_cell_exps] )
riboseq_snp_total_two_cell   = add_snp_dfs( [riboseq_experiment_dfs[e] for e in  riboseq_two_cell_exps] )
riboseq_snp_total_four_cell  = add_snp_dfs( [riboseq_experiment_dfs[e] for e in  riboseq_four_cell_exps] )
riboseq_snp_total_eight_cell = add_snp_dfs( [riboseq_experiment_dfs[e] for e in  riboseq_eigth_cell_exps] )


riboseq_snp_total_mii = add_snp_dfs( [riboseq_experiment_dfs[e] for e in  riboseq_mii_exps] )
riboseq_snp_total_gv  = add_snp_dfs( [riboseq_experiment_dfs[e] for e in  riboseq_gv_epxs] )

In [27]:
# RNA-Seq SNP counts summed over the replicates of each stage.
rnaseq_snp_total_one_cell   = add_snp_dfs( [rnaseq_experiment_dfs[e] for e in  rnaseq_one_cell_exps] )
rnaseq_snp_total_two_cell   = add_snp_dfs( [rnaseq_experiment_dfs[e] for e in  rnaseq_two_cell_exps] )
rnaseq_snp_total_four_cell  = add_snp_dfs( [rnaseq_experiment_dfs[e] for e in  rnaseq_four_cell_exps] )
rnaseq_snp_total_eight_cell = add_snp_dfs( [rnaseq_experiment_dfs[e] for e in  rnaseq_eight_cell_exps] )


rnaseq_snp_total_mii = add_snp_dfs( [rnaseq_experiment_dfs[e] for e in  rnaseq_mii_exps] )
rnaseq_snp_total_gv  = add_snp_dfs( [rnaseq_experiment_dfs[e] for e in  rnaseq_gv_epxs] )

We add the paternal / maternal counts ratios to our dataframes

In [28]:
def find_maternal_ratio(snp_df):
    """
    Adds maternal / paternal counts and the maternal ratio to a SNP dataframe.

    New columns (added in place, the same dataframe is also returned):
      maternal_count : count of the REF nucleotide
      paternal_count : count of the ALT nucleotide
      mat_pat_total  : maternal_count + paternal_count
      maternal_ratio : maternal_count / mat_pat_total

    Note: when both counts are 0, the denominator is replaced by 1 to avoid
    a division by zero. For such rows mat_pat_total is stored as 1 and
    maternal_ratio as 0.
    """
    per_snp = []

    for _, snp in snp_df.iterrows():
        maternal_reads = snp[ snp["REF"] ]
        paternal_reads = snp[ snp["ALT"] ]

        # Fall back to 1 when there is no coverage at all.
        denominator = (maternal_reads + paternal_reads) or 1

        per_snp.append( (maternal_reads,
                         paternal_reads,
                         denominator,
                         maternal_reads / denominator) )

    snp_df["maternal_count"] = [ entry[0] for entry in per_snp ]
    snp_df["paternal_count"] = [ entry[1] for entry in per_snp ]
    snp_df["mat_pat_total"]  = [ entry[2] for entry in per_snp ]
    snp_df["maternal_ratio"] = [ entry[3] for entry in per_snp ]

    return snp_df

In [29]:
# Add maternal / paternal counts and ratios to the ribosome profiling
# totals. The frames are modified in place; only the last call is displayed.
find_maternal_ratio( riboseq_snp_total_one_cell )
find_maternal_ratio( riboseq_snp_total_two_cell ) 
find_maternal_ratio( riboseq_snp_total_four_cell ) 
find_maternal_ratio( riboseq_snp_total_eight_cell )

find_maternal_ratio( riboseq_snp_total_mii )
find_maternal_ratio( riboseq_snp_total_gv ) 

Unnamed: 0_level_0,transcript,position,REF,ALT,A,C,G,T,snp_total,CDS_start,CDS_stop,maternal_count,paternal_count,mat_pat_total,maternal_ratio
trans_pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Padi6-201_165,Padi6-201,165,A,G,609,0,0,3,612,64,2112,609,0,609,1.000000
Zbed3-202_471,Zbed3-202,471,T,C,6,2,1,522,531,174,860,522,2,524,0.996183
Ldhb-201_969,Ldhb-201,969,T,C,0,3,0,409,412,112,1116,409,3,412,0.992718
Ankrd26-201_1877,Ankrd26-201,1877,C,A,0,367,0,0,367,170,5215,367,0,367,1.000000
Padi6-201_909,Padi6-201,909,T,C,4,1,0,279,284,64,2112,279,1,280,0.996429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gcnt3-201_563,Gcnt3-201,563,T,C,0,0,0,0,0,513,1826,0,0,1,0.000000
Gcnt3-201_737,Gcnt3-201,737,A,G,0,0,0,0,0,513,1826,0,0,1,0.000000
Gcnt3-201_986,Gcnt3-201,986,T,C,0,0,0,0,0,513,1826,0,0,1,0.000000
Gcnt3-201_1002,Gcnt3-201,1002,A,G,0,0,0,0,0,513,1826,0,0,1,0.000000


In [30]:
# Add maternal / paternal counts and ratios to the RNA-Seq totals.
# The frames are modified in place; only the last call is displayed.
find_maternal_ratio( rnaseq_snp_total_one_cell )
find_maternal_ratio( rnaseq_snp_total_two_cell ) 
find_maternal_ratio( rnaseq_snp_total_four_cell ) 
find_maternal_ratio( rnaseq_snp_total_eight_cell ) 

find_maternal_ratio( rnaseq_snp_total_mii )
find_maternal_ratio( rnaseq_snp_total_gv )

Unnamed: 0_level_0,transcript,position,REF,ALT,A,C,G,T,snp_total,CDS_start,CDS_stop,maternal_count,paternal_count,mat_pat_total,maternal_ratio
trans_pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
H1f8-201_69,H1f8-201,69,T,C,1,3,0,6102,6106,46,960,6102,3,6105,0.999509
Ooep-201_257,Ooep-201,257,T,C,0,2,0,6037,6039,198,692,6037,2,6039,0.999669
Ftdc2-201_438,Ftdc2-201,438,C,T,1,5426,0,3,5430,367,945,5426,3,5429,0.999447
Padi6-201_222,Padi6-201,222,T,C,0,0,3,5223,5226,64,2112,5223,0,5223,1.000000
Zp3-201_141,Zp3-201,141,G,T,1,0,5031,0,5032,36,1310,5031,0,5031,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ltbp4-201_4658,Ltbp4-201,4658,A,G,0,0,0,0,0,126,5126,0,0,1,0.000000
Shkbp1-201_861,Shkbp1-201,861,G,A,0,0,0,0,0,64,2178,0,0,1,0.000000
Shkbp1-201_1359,Shkbp1-201,1359,G,T,0,0,0,0,0,64,2178,0,0,1,0.000000
Shkbp1-201_1992,Shkbp1-201,1992,G,A,0,0,0,0,0,64,2178,0,0,1,0.000000


In [31]:
# Display the 4-cell ribosome profiling totals with the added ratio columns.
riboseq_snp_total_four_cell

Unnamed: 0_level_0,transcript,position,REF,ALT,A,C,G,T,snp_total,CDS_start,CDS_stop,maternal_count,paternal_count,mat_pat_total,maternal_ratio
trans_pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Cacna1b-201_6873,Cacna1b-201,6873,A,G,213,0,0,0,213,1,6984,213,0,213,1.000000
Ankrd26-201_1877,Ankrd26-201,1877,C,A,0,210,0,0,210,170,5215,210,0,210,1.000000
Npm1-201_731,Npm1-201,731,C,T,0,63,0,49,112,228,1106,63,49,112,0.562500
Zfp113-201_702,Zfp113-201,702,A,G,0,13,2,84,99,186,1505,0,2,2,0.000000
Eef2-201_1179,Eef2-201,1179,C,A,37,25,2,0,64,97,2673,25,37,62,0.403226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sorl1-201_5604,Sorl1-201,5604,G,A,0,0,0,0,0,185,6832,0,0,1,0.000000
Sc5d-201_530,Sc5d-201,530,A,G,0,0,0,0,0,141,1040,0,0,1,0.000000
Sc5d-201_790,Sc5d-201,790,A,G,0,0,0,0,0,141,1040,0,0,1,0.000000
Sc5d-201_824,Sc5d-201,824,C,T,0,0,0,0,0,141,1040,0,0,1,0.000000


In [32]:
# Handle to R's prop.test, called through rpy2.
proptest = robjects.r('prop.test')

def perform_prop_test(df_1, df_2, error_threshold = 0.01, filter_small_counts = True):
    """
    Decide whether paternal ratios differ between df_1 and df_2

    The two dataframes are merged on "trans_pos". For every merged row,
    R's prop.test is called with the paternal counts of the two inputs as
    the first argument and the corresponding maternal + paternal totals as
    the second one.

    Rows that are not tested get sentinel values instead of test results:

      * p_val = conf_mid = 10 (only when filter_small_counts is True) if
          - maternal + paternal is below 10 in either input, or
          - the maternal counts, or the paternal counts, sum to 0 across
            the two inputs, or
          - the paternal count is at most 2 in both inputs.
      * p_val = conf_mid = 100 if maternal + paternal is 0 in either input
        (only reachable when filter_small_counts is False).

    Sentinel rows always have diff_is_sufficient = 0.

    For tested rows, diff_is_sufficient is 1 when the confidence interval
    reported by prop.test stays at least error_threshold away from zero
    (upper bound <= -error_threshold, or lower bound >= error_threshold),
    and 0 otherwise.

    Returns a dataframe with the columns p_val, conf_mid (midpoint of the
    confidence interval), maternal_x, paternal_x, maternal_y, paternal_y
    and diff_is_sufficient, where _x refers to df_1 and _y to df_2.
    """
    
    pvalues            = []
    conf_midpoints     = []
    diff_is_sufficient = []

    merged_df = df_1.merge(df_2, on = "trans_pos")
    
    
    for i, e in merged_df.iterrows():
        #print(i, e)
        if filter_small_counts:
            
            # Mark insufficient counts with p-val 10 and exclude them from the test
            if (e["maternal_count_x"]+ e["paternal_count_x"]) < 10  or\
               (e["maternal_count_y"] + e["paternal_count_y"]) < 10 :
                pvalues.append(10)
                conf_midpoints.append(10)
                diff_is_sufficient.append(0)
                continue


            # No maternal reads at all, or no paternal reads at all,
            # across the two inputs: also marked with 10.
            if e["maternal_count_x"]+ e["maternal_count_y"] == 0 or\
               e["paternal_count_x"] + e["paternal_count_y"] == 0 :

                #print("detected zero at", i)
                pvalues.append(10)
                conf_midpoints.append(10)
                diff_is_sufficient.append(0)
                continue

            # Hardly any paternal reads in either input: also marked with 10.
            if e["paternal_count_x"] <=2 and e["paternal_count_y"] <= 2:
                pvalues.append(10)
                conf_midpoints.append(10)
                diff_is_sufficient.append(0)
                continue
        # An input without any reads cannot be tested: mark with 100.
        # With filter_small_counts enabled, such rows were already
        # marked with 10 above.
        if e["maternal_count_x"]+ e["paternal_count_x"] == 0 or\
               e["maternal_count_y"] + e["paternal_count_y"] == 0 :

                #print("detected zero at", i)
                pvalues.append(100)
                conf_midpoints.append(100)
                diff_is_sufficient.append(0)
                continue
        
        r_paternal_counts = robjects.IntVector([ e["paternal_count_x"], e["paternal_count_y"] ])
        r_total_counts    = robjects.IntVector([ e["paternal_count_x"] + e["maternal_count_x"], 
                                                 e["paternal_count_y"] + e["maternal_count_y"] ])
        
        proptest_result = proptest(r_paternal_counts, r_total_counts)
        result_dict     = dict(proptest_result.items())
        conf_int        = result_dict["conf.int"]
        mid_point_conf  = (conf_int[0] + conf_int[1]) / 2
        

        
        # diff_is_sufficient:
        #   0 means the difference between the two ratios is very small
        #   1 means the difference is NOT very small
        # We will use it for filtering purposes.
        if conf_int[1] < 0 and error_threshold > ((-1) * conf_int[1]):
            # Interval entirely below zero, but its upper end is closer
            # to zero than the threshold.
            diff_is_sufficient.append(0)
        elif conf_int[1] >=0 and (error_threshold > conf_int[0]):
            # Interval reaches zero or above, and its lower end is below
            # the threshold.
            diff_is_sufficient.append(0)
        else:
            diff_is_sufficient.append(1)
        
        # Third element of the prop.test result.
        # NOTE(review): presumably the "p.value" entry - confirm.
        this_p_val = proptest_result[2][0]
        pvalues.append(this_p_val)
        
        conf_midpoints.append(mid_point_conf)  
        
    result_dict = { #"trans_pos" : merged_df["trans_pos"],
                    "p_val":               pvalues,
                    "conf_mid":            conf_midpoints,
                    "maternal_x":          merged_df["maternal_count_x"],
                    "paternal_x":          merged_df["paternal_count_x"],
                    "maternal_y":          merged_df["maternal_count_y"],
                    "paternal_y":          merged_df["paternal_count_y"],
                    "diff_is_sufficient" : diff_is_sufficient,  }
    
    
    
    result_df =  pd.DataFrame.from_dict( result_dict )
    
    return result_df

## Transcript Level SNPs

We aggregate the maternal & paternal counts of each transcript. Next, we determine which transcripts change their behavior (in terms of the paternal / maternal ratio).

In [33]:
def get_transcript_level_counts(snp_df):
    """
    Sums the maternal and paternal counts of all SNPs of each transcript.

    Returns a dataframe with one row per transcript, in order of first
    appearance in snp_df, and the columns transcript, maternal_count and
    paternal_count. The index holds the transcript names and is named
    "trans_pos", so that the result can be merged on "trans_pos".
    """
    per_transcript = {}

    for _, snp in snp_df.iterrows():
        name  = snp["transcript"]
        tally = per_transcript.setdefault(
            name,
            { "transcript": name, "maternal_count": 0, "paternal_count": 0 } )

        tally["maternal_count"] += snp["maternal_count"]
        tally["paternal_count"] += snp["paternal_count"]

    result_df = pd.DataFrame.from_dict( per_transcript ).transpose()
    result_df.index.rename("trans_pos", inplace = True)

    return result_df

In [34]:
# Transcript-level maternal / paternal counts for each ribosome profiling stage.
riboseq_t_level_snp_total_gv         = get_transcript_level_counts(riboseq_snp_total_gv)
riboseq_t_level_snp_total_mii        = get_transcript_level_counts(riboseq_snp_total_mii )
riboseq_t_level_snp_total_one_cell   = get_transcript_level_counts(riboseq_snp_total_one_cell )
riboseq_t_level_snp_total_two_cell   = get_transcript_level_counts(riboseq_snp_total_two_cell )
riboseq_t_level_snp_total_four_cell  = get_transcript_level_counts(riboseq_snp_total_four_cell )
riboseq_t_level_snp_total_eight_cell = get_transcript_level_counts(riboseq_snp_total_eight_cell )

In [35]:
# Transcript-level maternal / paternal counts for each RNA-Seq stage.
rnaseq_t_level_snp_total_mii        = get_transcript_level_counts(rnaseq_snp_total_mii )
rnaseq_t_level_snp_total_gv         = get_transcript_level_counts(rnaseq_snp_total_gv )
rnaseq_t_level_snp_total_one_cell   = get_transcript_level_counts(rnaseq_snp_total_one_cell )
rnaseq_t_level_snp_total_two_cell   = get_transcript_level_counts(rnaseq_snp_total_two_cell )
rnaseq_t_level_snp_total_four_cell  = get_transcript_level_counts(rnaseq_snp_total_four_cell )
rnaseq_t_level_snp_total_eight_cell = get_transcript_level_counts(rnaseq_snp_total_eight_cell )

### Perform PropTest

We will use the results of prop-test for filtering purposes. 

Namely, the genes that were used in the prop-test are the **"expressed"** genes.

In [36]:


# Transcript-level prop-test per stage, ribosome profiling (x) against
# RNA-Seq (y), all using PROPTEST_ERROR_THRESHOLD.
proptest_t_level_gv_ribo_rna         = perform_prop_test( riboseq_t_level_snp_total_gv, 
                                                          rnaseq_t_level_snp_total_gv,
                                                          error_threshold = PROPTEST_ERROR_THRESHOLD)

proptest_t_level_mii_ribo_rna        = perform_prop_test( riboseq_t_level_snp_total_mii, 
                                                          rnaseq_t_level_snp_total_mii,
                                                          error_threshold = PROPTEST_ERROR_THRESHOLD)

proptest_t_level_one_cell_ribo_rna   = perform_prop_test( riboseq_t_level_snp_total_one_cell, 
                                                          rnaseq_t_level_snp_total_one_cell,
                                                          error_threshold = PROPTEST_ERROR_THRESHOLD)

proptest_t_level_two_cell_ribo_rna   = perform_prop_test( riboseq_t_level_snp_total_two_cell, 
                                                          rnaseq_t_level_snp_total_two_cell,
                                                          error_threshold = PROPTEST_ERROR_THRESHOLD)

proptest_t_level_four_cell_ribo_rna  = perform_prop_test( riboseq_t_level_snp_total_four_cell, 
                                                          rnaseq_t_level_snp_total_four_cell,
                                                          error_threshold = PROPTEST_ERROR_THRESHOLD)

proptest_t_level_eight_cell_ribo_rna = perform_prop_test( riboseq_t_level_snp_total_eight_cell, 
                                                          rnaseq_t_level_snp_total_eight_cell,
                                                          error_threshold = PROPTEST_ERROR_THRESHOLD)

In [37]:
# Display the 4-cell prop-test table (p_val above 1 marks untested rows).
proptest_t_level_four_cell_ribo_rna

Unnamed: 0_level_0,p_val,conf_mid,maternal_x,paternal_x,maternal_y,paternal_y,diff_is_sufficient
trans_pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cacna1b-201,10.000000,10.000000,213,0,2,0,0
Ankrd26-201,10.000000,10.000000,212,0,3,0,0
Npm1-201,0.123346,-0.073905,76,61,527,569,0
Zfp113-201,10.000000,10.000000,0,2,3,2,0
Eef2-201,0.014625,0.146279,73,105,74,59,0
...,...,...,...,...,...,...,...
Grik4-202,10.000000,10.000000,0,0,0,0,0
Clmp-201,10.000000,10.000000,0,0,1,0,0
Ubash3b-201,10.000000,10.000000,0,0,1,0,0
Sc5d-201,10.000000,10.000000,0,0,2,2,0


In [38]:
# "Expressed" genes at the 4-cell stage: transcripts that were actually
# tested by the prop-test. Untested rows carry sentinel p-values above 1.
# .copy() gives an independent frame, so re-indexing it below does not
# operate on a slice of the prop-test table.
fourcell_expressed_genes_df = proptest_t_level_four_cell_ribo_rna.loc[
    proptest_t_level_four_cell_ribo_rna["p_val"] <= 1,
    ["maternal_x", "paternal_x", "maternal_y", "paternal_y"],
].copy()

# Transcript names look like "<gene>-<transcript number>" (e.g. "Npm1-201").
# Strip only the last "-" suffix, so that gene symbols which themselves
# contain a hyphen are not truncated.
gene_names_4cell = [ name.rsplit("-", 1)[0] for name in fourcell_expressed_genes_df.index ]

fourcell_expressed_genes_df.index = pd.Index(gene_names_4cell, name = "gene")

fourcell_expressed_genes_df

Unnamed: 0_level_0,maternal_x,paternal_x,maternal_y,paternal_y
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Npm1,76,61,527,569
Eef2,73,105,74,59
Timd2,45,45,198,179
Ddx54,12,17,34,16
Rps11,16,47,1435,1610
...,...,...,...,...
Swt1,8,5,50,30
Ddias,5,6,31,63
Hmmr,10,6,21,20
Sgo2a,10,0,11,12


In [39]:
# "Expressed" genes at the 8-cell stage: transcripts that were actually
# tested by the prop-test. Untested rows carry sentinel p-values above 1.
# .copy() gives an independent frame, so re-indexing it below does not
# operate on a slice of the prop-test table.
eightcell_expressed_genes_df = proptest_t_level_eight_cell_ribo_rna.loc[
    proptest_t_level_eight_cell_ribo_rna["p_val"] <= 1,
    ["maternal_x", "paternal_x", "maternal_y", "paternal_y"],
].copy()

# Transcript names look like "<gene>-<transcript number>" (e.g. "Npm1-201").
# Strip only the last "-" suffix, so that gene symbols which themselves
# contain a hyphen are not truncated.
gene_names_8cell = [ name.rsplit("-", 1)[0] for name in eightcell_expressed_genes_df.index ]

eightcell_expressed_genes_df.index = pd.Index(gene_names_8cell, name = "gene")

eightcell_expressed_genes_df


Unnamed: 0_level_0,maternal_x,paternal_x,maternal_y,paternal_y
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Npm1,142,158,1381,1313
Hspa8,158,139,1028,1170
Eef2,192,159,388,392
Timd2,186,130,744,572
Rpl38,44,67,1992,1634
...,...,...,...,...
Nol8,7,5,50,13
Arhgap21,2,11,27,31
Wdfy3,4,6,8,3
Nup205,7,4,34,50


In [40]:
# Write the expressed-gene SNP counts next to the other outputs of this
# notebook, i.e. under OUTPUT_FOLDER rather than the working directory.
fourcell_expressed_genes_df.to_csv( os.path.join(OUTPUT_FOLDER, "fourcell_snp_counts.csv.gz") )
eightcell_expressed_genes_df.to_csv( os.path.join(OUTPUT_FOLDER, "eightcell_snp_counts.csv.gz") )

In [44]:
# SNP counts of ALL transcripts in the prop-test tables (tested or not)
# for the 4-cell and 8-cell stages.
# .copy() gives independent frames, so re-indexing them below does not
# operate on a slice of the prop-test tables.
fourcell_all_snp_counts_df  = proptest_t_level_four_cell_ribo_rna[ ["maternal_x", "paternal_x", "maternal_y", "paternal_y"] ].copy()

eightcell_all_snp_counts_df = proptest_t_level_eight_cell_ribo_rna[ ["maternal_x", "paternal_x", "maternal_y", "paternal_y"] ].copy()

# Transcript names look like "<gene>-<transcript number>". Strip only the
# last "-" suffix, so that gene symbols which themselves contain a hyphen
# are not truncated.
gene_names_4cell_all = [ name.rsplit("-", 1)[0] for name in fourcell_all_snp_counts_df.index ]
gene_names_8cell_all = [ name.rsplit("-", 1)[0] for name in eightcell_all_snp_counts_df.index ]

fourcell_all_snp_counts_df.index  = pd.Index(gene_names_4cell_all, name = "gene")
eightcell_all_snp_counts_df.index = pd.Index(gene_names_8cell_all, name = "gene")

# Written under OUTPUT_FOLDER, like the other outputs of this notebook.
fourcell_all_snp_counts_df.to_csv( os.path.join(OUTPUT_FOLDER, "fourcell_all_snp_counts.csv.gz") )
eightcell_all_snp_counts_df.to_csv( os.path.join(OUTPUT_FOLDER, "eightcell_all_snp_counts.csv.gz") )