# A closer look at the SNPs of Highlighted Genes

We hhiglighted 24 genes that show striking difference in maternal and paternal allele translation and activation in between stages. Here we study the positions of the SNPs, the regions they come from (5' UTR, CDS, or 3' UTR) and the nucleotide sequences around those SNPs.

In [3]:

import numpy as np
import pandas as pd

import sys
sys.path.insert(0, '../../snp')
from ref_lib.Fasta import FastaEntry, FastaFile

In [79]:
codon_to_amio_acid = {
    "TTT": "Phe",
    "TTC": "Phe",
    
    "TTA": "Leu",
    "TTG": "Leu",
    "CTT": "Leu",
    "CTC": "Leu",
    "CTA": "Leu",
    "CTG": "Leu",
    
    "ATT": "Ile",
    "ATC": "Ile",
    "ATA": "Ile",
    
    "ATG": "Met",
    
    "GTT": "Val",
    "GTC": "Val",
    "GTA": "Val",
    "GTG": "Val",
    
    "TCT": "Ser",
    "TCC": "Ser",
    "TCA": "Ser",
    "TCG": "Ser",
    
    "CCT": "Pro",
    "CCC": "Pro",
    "CCA": "Pro",
    "CCG": "Pro",
    
    "ACT": "Thr",
    "ACC": "Thr",
    "ACA": "Thr",
    "ACG": "Thr",
    
    "GCT": "Ala",
    "GCC": "Ala",
    "GCA": "Ala",
    "GCG": "Ala",
    
    "TAT": "Tyr",
    "TAC": "Tyr",
    
    "TAA": "STOP",
    "TAG": "STOP",
    
    "CAT": "His",
    "CAC": "His",
    
    "CAA": "Gln",
    "CAG": "Gln",
    
    "AAT": "Asn",
    "AAC": "Asn",
    
    "AAA": "Lys",
    "AAG": "Lys",
    
    "GAT": "Asp",
    "GAC": "Asp",
    
    "GAA": "Glu",
    "GAG": "Glu",
    
    "TGT": "Cys",
    "TGC": "Cys",
    
    "TGA": "STOP",
    
    "TGG": "Trp",
    
    "CGT": "Arg",
    "CGC": "Arg",
    "CGA": "Arg",
    "CGG": "Arg",
    
    "AGT": "Ser",
    "AGC": "Ser",
    
    "AGA": "Arg",
    "AGG": "Arg",
    
    "GGT": "Gly",
    "GGC": "Gly",
    "GGA": "Gly",
    "GGG": "Gly"    
}

In [83]:
codon_to_amio_acid["AGC"]

'Ser'

In [4]:
# This comes from our prior porportionality analysis

list_of_genes = (
  'Nop14',
  'Tmppe', 
  'Slc13a2',
  'Ppp2ca',
  'Srpk1',
  'Cbx3',
  'Ncoa3',
  'Cdk1',
  'Baz1a',
  'Dyrk3',
  'Lclat1',
  'Lyar',
  'Umps',
  'Tsen2',
  'Ccnh',
  
  'Folr1',
  'Pa2g4',
  'Zfp296',
  'Mrps9',
  'Eif3d',
  'Nin',
  'Ddx21',
  'Bcat1',
  'Mysm1'
)

In [5]:
ribo_detailed_snp_file      = "./snp_dataframes/riboseq_detailed_snps.csv.gz"

transcriptome_sequence_file = "../../../mouse_itp_reference/transcriptome/varnt_masked_and_filtered_mouse_transcriptome.fa.gz"

annotation_file             = "../../../mouse_itp_reference/transcriptome/appris_mouse_v2_filtered_regions.bed"

all_transcriptomic_snps_file = "../../snp/reference_files/transcriptomic_variants.vcf.gz"

In [44]:
mouse_sequences = dict()

with FastaFile(transcriptome_sequence_file) as mouse_transcriptome:
    for entry in mouse_transcriptome:
        this_gene = entry.header.split("|")[5]
        mouse_sequences[ this_gene ] = entry.sequence

In [15]:
ribo_all_snps = pd.read_csv(ribo_detailed_snp_file, index_col=0)

In [25]:
# Randomly pick the first experiment first

one_exp_slice = ribo_all_snps.loc["20210513-ITP-1cell-cross-50-A"]
#ribo_all_snps.loc[""]

In [26]:
one_exp_slice

Unnamed: 0_level_0,transcript,position,paternal,maternal,REF,ALT,A,C,G,T
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20210513-ITP-1cell-cross-50-A,Spin1-201,848,0,139,C,G,0,139,0,0
20210513-ITP-1cell-cross-50-A,Zbed3-202,471,0,124,T,C,0,0,0,124
20210513-ITP-1cell-cross-50-A,Zbed3-202,707,0,100,A,G,100,0,0,0
20210513-ITP-1cell-cross-50-A,E330034G19Rik-203,257,0,74,A,C,74,0,0,0
20210513-ITP-1cell-cross-50-A,Zbed3-202,320,4,52,T,G,4,0,4,52
...,...,...,...,...,...,...,...,...,...,...
20210513-ITP-1cell-cross-50-A,Gpd1-201,825,0,0,T,C,0,0,0,0
20210513-ITP-1cell-cross-50-A,Gpd1-201,508,0,0,C,T,0,0,0,0
20210513-ITP-1cell-cross-50-A,Gpd1-201,462,0,0,C,T,0,0,0,0
20210513-ITP-1cell-cross-50-A,Smarcd1-201,1699,0,0,G,A,0,0,0,0


In [37]:
one_exp_slice["transcript"] = list(map(  lambda x: x.split("-")[0], one_exp_slice["transcript"] ) )
one_exp_slice = one_exp_slice.filter(items = ["transcript", "position", "REF", "ALT"], axis = 1)
one_exp_slice.reset_index(drop=True, inplace=True)

In [38]:
one_exp_slice

Unnamed: 0,transcript,position,REF,ALT
0,Spin1,848,C,G
1,Zbed3,471,T,C
2,Zbed3,707,A,G
3,E330034G19Rik,257,A,C
4,Zbed3,320,T,G
...,...,...,...,...
85334,Gpd1,825,T,C
85335,Gpd1,508,C,T
85336,Gpd1,462,C,T
85337,Smarcd1,1699,G,A


In [65]:
cds_boundaries = dict()

annotation_file = "../../../mouse_itp_reference/transcriptome/appris_mouse_v2_filtered_regions.bed"

with open(annotation_file, "r") as annotation_stream:
    for entry in annotation_stream:
        contents = entry.split()
        
        if contents[3] == "CDS":
            this_gene                 = contents[0].split("|")[5]
            cds_boundaries[this_gene] = [int(contents[1]), int(contents[2])]
            
        

In [67]:
def determine_region(gene, position, cds_boundaries):
    this_region = "CDS"
    
    if position < cds_boundaries[gene][0]:
        this_region = "UTR5"
        
    if position > cds_boundaries[gene][1]:
        this_region = "UTR3"
    
    return this_region

In [68]:
cds_boundaries

{'Xkr4': [150, 2091],
 'Rp1': [54, 4167],
 'Sox17': [1082, 2339],
 'Mrpl15': [62, 947],
 'Lypla1': [91, 781],
 'Gm37988': [22, 208],
 'Tcea1': [100, 1003],
 'Rgs20': [160, 877],
 'Atp6v1h': [159, 1554],
 'Oprk1': [185, 1325],
 'Npbwr1': [105, 1092],
 'Rb1cc1': [467, 5231],
 'Alkal1': [113, 494],
 'St18': [420, 3555],
 'Pcmtd1': [412, 1483],
 'Sntg1': [1333, 2884],
 'Rrs1': [116, 1211],
 'Adhfe1': [145, 1540],
 'Vxn': [385, 1006],
 'Mybl1': [255, 2508],
 'Vcpip1': [226, 3886],
 'Sgk3': [333, 1821],
 'Mcmdc2': [97, 2140],
 'Tcf24': [447, 945],
 'Ppp1r42': [155, 1226],
 'Cops5': [315, 1317],
 'Cspp1': [138, 3729],
 'Arfgef1': [176, 5714],
 'Cpa6': [218, 1532],
 'Prex2': [328, 5122],
 'A830018L16Rik': [464, 1841],
 'Sulf1': [635, 3245],
 'Slco5a1': [640, 3190],
 'Prdm14': [75, 1758],
 'Ncoa2': [222, 4608],
 'Tram1': [188, 1310],
 'Lactb2': [96, 960],
 'Xkr9': [464, 1583],
 'Eya1': [609, 2382],
 'Msc': [242, 845],
 'Trpa1': [27, 3402],
 'Kcnb2': [0, 2721],
 'Terf1': [32, 1295],
 'Sbspon': [

In [61]:
! head -n 10 ../../../mouse_itp_reference/transcriptome/appris_mouse_v2_filtered_regions.bed

ENSMUST00000070533.4|ENSMUSG00000051951.5|OTTMUSG00000026353.2|OTTMUST00000065166.1|Xkr4-201|Xkr4|3634|UTR5:1-150|CDS:151-2094|UTR3:2095-3634|	0	150	UTR5	0	+
ENSMUST00000070533.4|ENSMUSG00000051951.5|OTTMUSG00000026353.2|OTTMUST00000065166.1|Xkr4-201|Xkr4|3634|UTR5:1-150|CDS:151-2094|UTR3:2095-3634|	150	2091	CDS	0	+
ENSMUST00000070533.4|ENSMUSG00000051951.5|OTTMUSG00000026353.2|OTTMUST00000065166.1|Xkr4-201|Xkr4|3634|UTR5:1-150|CDS:151-2094|UTR3:2095-3634|	2091	3634	UTR3	0	+
ENSMUST00000208660.1|ENSMUSG00000025900.13|OTTMUSG00000049985.3|OTTMUST00000145515.1|Rp1-202|Rp1|4170|UTR5:1-54|CDS:55-4170|	0	54	UTR5	0	+
ENSMUST00000208660.1|ENSMUSG00000025900.13|OTTMUSG00000049985.3|OTTMUST00000145515.1|Rp1-202|Rp1|4170|UTR5:1-54|CDS:55-4170|	54	4167	CDS	0	+
ENSMUST00000208660.1|ENSMUSG00000025900.13|OTTMUSG00000049985.3|OTTMUST00000145515.1|Rp1-202|Rp1|4170|UTR5:1-54|CDS:55-4170|	4167	4170	UTR3	0	+
ENSMUST00000027035.9|ENSMUSG00000025902.13|OTTMUSG00000050014.7|OTTMUST00000127245.2|Sox17

In [69]:
# Note that our snp file is 1-based!
# So we need the -1 adjustment

left_span  = 3
right_span = 3

sequences_arounds_snps = list()
regions                = list()

for index, entry in genes_df.iterrows():
    sequence_start = entry["position"] - 1 - left_span
    sequence_stop  = entry["position"] + right_span 
    sequence       = mouse_sequences[ entry["transcript"] ][sequence_start : sequence_stop]
    sequences_arounds_snps.append(sequence)
    
    this_region = determine_region(entry["transcript"], entry["position"], cds_boundaries)
    regions.append(this_region)

genes_df["sequence"] = sequences_arounds_snps
genes_df["region"]   = regions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [77]:
genes_df

Unnamed: 0,transcript,position,REF,ALT,sequence,region
393,Nin,1674,G,A,AGTNGAT,CDS
781,Nin,2571,A,G,GGANAGG,CDS
901,Nin,729,A,G,AACNATG,CDS
927,Nin,720,T,C,TCCNGAT,CDS
1170,Mysm1,1059,A,C,TCANGTG,CDS
...,...,...,...,...,...,...
83733,Eif3d,1296,C,T,CAANGGG,CDS
83734,Eif3d,1278,T,C,TGGNGTC,CDS
83735,Eif3d,1176,G,A,GGANGAC,CDS
83736,Eif3d,1029,T,C,CAANGAG,CDS


In [73]:
cds_boundaries["Eif3d"]

[141, 1785]

In [78]:
mouse_sequences["Eif3d"][141:144]

'ATG'

In [85]:
"c" in ("a", "c", "e")

True