# A closer look at the SNPs of Highlighted Genes

We highlighted 24 genes that show a striking difference in maternal and paternal allele translation and activation between stages. Here we study the positions of the SNPs, the regions they come from (5' UTR, CDS, or 3' UTR), and the nucleotide sequences around those SNPs.

In [1]:

import numpy as np
import pandas as pd
import copy

import gzip

import sys
sys.path.insert(0, '../../snp')
from ref_lib.Fasta import FastaEntry, FastaFile

In [2]:
# Genetic code lookup: DNA codon -> three-letter amino acid abbreviation.
# The three stop codons map to "STOP".
# Each row pairs one amino acid with its codons. Rows follow the order of the
# original hand-written table, so an amino acid can appear on more than one row.
_codons_by_amino_acid = (
    ("Phe", "TTT TTC"),
    ("Leu", "TTA TTG CTT CTC CTA CTG"),
    ("Ile", "ATT ATC ATA"),
    ("Met", "ATG"),
    ("Val", "GTT GTC GTA GTG"),
    ("Ser", "TCT TCC TCA TCG"),
    ("Pro", "CCT CCC CCA CCG"),
    ("Thr", "ACT ACC ACA ACG"),
    ("Ala", "GCT GCC GCA GCG"),
    ("Tyr", "TAT TAC"),
    ("STOP", "TAA TAG"),
    ("His", "CAT CAC"),
    ("Gln", "CAA CAG"),
    ("Asn", "AAT AAC"),
    ("Lys", "AAA AAG"),
    ("Asp", "GAT GAC"),
    ("Glu", "GAA GAG"),
    ("Cys", "TGT TGC"),
    ("STOP", "TGA"),
    ("Trp", "TGG"),
    ("Arg", "CGT CGC CGA CGG"),
    ("Ser", "AGT AGC"),
    ("Arg", "AGA AGG"),
    ("Gly", "GGT GGC GGA GGG"),
)

codon_to_amio_acid = {
    codon: amino_acid
    for amino_acid, codons in _codons_by_amino_acid
    for codon in codons.split()
}

In [3]:
# Spot-check the lookup table with a single codon.
codon_to_amio_acid["AGC"]

'Ser'

In [4]:
# This comes from our prior proportionality analysis.

# All highlighted genes.
list_of_genes = (
    "Nop14", "Tmppe", "Slc13a2", "Ppp2ca", "Srpk1",
    "Cbx3", "Ncoa3", "Cdk1", "Baz1a", "Dyrk3",
    "Lclat1", "Lyar", "Umps", "Tsen2", "Ccnh",
    "Folr1", "Pa2g4", "Zfp296", "Mrps9", "Eif3d",
    "Nin", "Ddx21", "Bcat1", "Mysm1",
)

# The same genes, split into four clusters.
cluster_1_genes = ("Nop14", "Slc13a2")

cluster_2_genes = (
    "Cbx3", "Srpk1", "Umps",
    "Mysm1", "Ppp2ca", "Bcat1",
)

cluster_3_genes = (
    "Folr1", "Zfp296", "Nin", "Ddx21",
    "Eif3d", "Pa2g4", "Mrps9", "Tsen2",
)

cluster_4_genes = (
    "Cdk1", "Baz1a", "Lclat1", "Ncoa3",
    "Lyar", "Dyrk3", "Ccnh", "Tmppe",
)

In [13]:
# Input files, given relative to the directory of this notebook.

# Detailed SNP table from the ribosome profiling data.
ribo_detailed_snp_file = "./snp_dataframes/riboseq_detailed_snps.csv.gz"

# Transcriptome sequences (gzipped FASTA).
transcriptome_sequence_file = "../../../mouse_itp_reference/transcriptome/varnt_masked_and_filtered_mouse_transcriptome.fa.gz"

# Region annotation of the transcripts (BED).
annotation_file = "../../../mouse_itp_reference/transcriptome/appris_mouse_v2_filtered_regions.bed"

# All transcriptomic variants (gzipped VCF).
all_transcriptomic_snps_file = "./transcriptomic_variants.vcf.gz"

In [14]:
# Gene name -> transcript sequence.
# The gene name is the 6th "|"-separated field of each FASTA header.
mouse_sequences = {}

with FastaFile(transcriptome_sequence_file) as fasta_reader:
    mouse_sequences.update(
        (record.header.split("|")[5], record.sequence)
        for record in fasta_reader
    )

In [16]:
## Collect the columns of the SNP dataframe.
## Every record of the transcriptomic variants file that belongs to one of
## the highlighted genes is kept.

## VERY IMPORTANT:
## VCF positions are 1-based.
## We subtract 1 while reading the file, so the stored positions are 0-based.

df_fields = {
    column: []
    for column in ("transcript", "gene", "position", "maternal",
                   "paternal", "chromosome", "chr_pos", "strand")
}

with gzip.open(all_transcriptomic_snps_file, "rt") as vcf_stream:
    for vcf_line in vcf_stream:
        # Skip the VCF header lines.
        if vcf_line.startswith("#"):
            continue

        fields       = vcf_line.strip().split("\t")
        label_fields = fields[0].split("|")
        gene_name    = label_fields[5]

        if gene_name not in list_of_genes:
            continue

        # Transcript identifier without its version suffix.
        transcript_id = label_fields[0].split(".")[0]

        column_values = (
            ("transcript", transcript_id),
            ("gene",       gene_name),
            # 1-based VCF coordinate -> 0-based coordinate
            ("position",   int(fields[1]) - 1),
            ("maternal",   fields[3]),
            ("paternal",   fields[4]),
            ("chromosome", fields[10]),
            ("chr_pos",    fields[11]),
            ("strand",     fields[12]),
        )
        for column, value in column_values:
            df_fields[column].append(value)



In [17]:
# One row per SNP of the highlighted genes; the columns are the keys of df_fields.
snp_df = pd.DataFrame(df_fields)

In [18]:
# Gene name -> [CDS start, CDS end], taken from the "CDS" rows of the
# region annotation file.
annotation_file = "../../../mouse_itp_reference/transcriptome/appris_mouse_v2_filtered_regions.bed"

cds_boundaries = {}

with open(annotation_file, "r") as bed_stream:
    for bed_line in bed_stream:
        fields = bed_line.split()

        # The 4th column holds the region name; only CDS rows are needed.
        if fields[3] != "CDS":
            continue

        gene_name = fields[0].split("|")[5]
        cds_boundaries[gene_name] = [int(fields[1]), int(fields[2])]
            
        

In [19]:
def determine_region(gene, position, cds_boundaries):
    """Return the transcript region that contains a 0-based position.

    Parameters
    ----------
    gene : str
        Key used to look up the CDS boundaries.
    position : int
        0-based position on the transcript.
    cds_boundaries : dict
        Maps a gene to ``[cds_start, cds_end]`` as read from the BED
        annotation: ``cds_start`` is the first CDS position (inclusive) and
        ``cds_end`` is one past the last CDS position (exclusive).

    Returns
    -------
    str
        ``"UTR5"``, ``"CDS"`` or ``"UTR3"``.

    Raises
    ------
    KeyError
        If ``gene`` has no entry in ``cds_boundaries``.
    """
    boundaries = cds_boundaries[gene]

    # BED intervals are half-open, so the end coordinate itself is already
    # the first nucleotide of the 3' UTR; hence ">=" and not ">".
    if position >= boundaries[1]:
        return "UTR3"

    if position < boundaries[0]:
        return "UTR5"

    return "CDS"

In [20]:
### Add the region column: the transcript region (UTR5 / CDS / UTR3) of every SNP.

snp_regions = [
    determine_region(gene_name, int(snp_position), cds_boundaries)
    for gene_name, snp_position in zip(snp_df["gene"], snp_df["position"])
]

snp_df["region"] = snp_regions

In [21]:
## Next add the sequence around the SNP.

# Number of nucleotides taken on each side of the SNP.
left_span  = 4
right_span = 4

sequences_around_snps = list()

for index, entry in snp_df.iterrows():
    # "position" is already 0-based (converted while reading the VCF),
    # so it indexes the transcript sequence directly.
    #
    # The start is clamped at 0. A negative start would be read by Python
    # slicing as an offset from the END of the sequence, which yields an
    # empty or wrong window for SNPs closer than left_span to the 5' end.
    # For those SNPs the window is simply shorter on the left side.
    sequence_start = max(0, entry["position"] - left_span)
    # A stop beyond the end of the sequence is truncated by slicing.
    sequence_stop  = entry["position"] + right_span + 1
    sequence       = mouse_sequences[ entry["gene"] ][sequence_start : sequence_stop]
    sequences_around_snps.append(sequence)

snp_df["sequence"] = sequences_around_snps

In [22]:
# Display the SNP table.
snp_df

Unnamed: 0,transcript,gene,position,maternal,paternal,chromosome,chr_pos,strand,region,sequence
0,ENSMUST00000045866,Ddx21,4410,A,G,chr10,62580562,-,UTR3,TTAANTGAA
1,ENSMUST00000045866,Ddx21,4399,A,G,chr10,62580573,-,UTR3,ATTGNCACA
2,ENSMUST00000045866,Ddx21,4063,C,T,chr10,62580909,-,UTR3,CTCCNGGTA
3,ENSMUST00000045866,Ddx21,3913,G,A,chr10,62581059,-,UTR3,TTGGNAGGA
4,ENSMUST00000045866,Ddx21,3895,C,A,chr10,62581077,-,UTR3,AGGGNTCTT
...,...,...,...,...,...,...,...,...,...,...
523,ENSMUST00000111820,Tmppe,2175,A,G,chr9,114406531,+,UTR3,GATTNCTGT
524,ENSMUST00000111820,Tmppe,2591,T,G,chr9,114406947,+,UTR3,TCACNCATA
525,ENSMUST00000111820,Tmppe,2708,C,T,chr9,114407064,+,UTR3,TGGGNCTGA
526,ENSMUST00000111820,Tmppe,2721,A,T,chr9,114407077,+,UTR3,TTAGNGAGA


In [23]:
# For the SNPs in the CDS, determine the codon and the amino acid carried by
# the maternal and by the paternal allele.
# SNPs outside of the CDS get "NA" in all four columns.

maternal_sequences = []
paternal_sequences = []

maternal_aa = []
paternal_aa = []

for _, snp in snp_df.iterrows():
    if snp["region"] == "CDS":
        gene_name    = snp["gene"]
        snp_position = snp["position"]

        # Offset of the SNP inside its codon (0, 1 or 2), counted from the CDS start.
        offset_in_codon = (snp_position - cds_boundaries[gene_name][0]) % 3
        codon_begin     = snp_position - offset_in_codon
        codon_letters   = list(mouse_sequences[gene_name][codon_begin : codon_begin + 3])

        # Put each allele at the SNP offset of its own copy of the codon.
        allele_codons = []
        for allele in (snp["maternal"], snp["paternal"]):
            letters = list(codon_letters)
            letters[offset_in_codon] = allele
            allele_codons.append("".join(letters))

        maternal_codon, paternal_codon = allele_codons
        maternal_sequences.append(maternal_codon)
        paternal_sequences.append(paternal_codon)
        maternal_aa.append(codon_to_amio_acid[maternal_codon])
        paternal_aa.append(codon_to_amio_acid[paternal_codon])
    else:
        for column_values in (maternal_sequences, paternal_sequences,
                              maternal_aa, paternal_aa):
            column_values.append("NA")

snp_df["maternal_codon"] = maternal_sequences
snp_df["paternal_codon"] = paternal_sequences

snp_df["maternal_AA"] = maternal_aa
snp_df["paternal_AA"] = paternal_aa




    


In [24]:
# Split the SNPs by transcript region.
region_labels = snp_df["region"]

utr5_snps = snp_df[region_labels == "UTR5"]
cds_snps  = snp_df[region_labels == "CDS"]
utr3_snps = snp_df[region_labels == "UTR3"]

# A CDS SNP is synonymous when both alleles encode the same amino acid.
same_amino_acid = cds_snps["maternal_AA"] == cds_snps["paternal_AA"]

total_cds_snps  = len(cds_snps)
synonymous_snps = np.sum(same_amino_acid)
non_syn_snps    = total_cds_snps - synonymous_snps



## Summary

In [25]:
# Report the total number of SNPs, their distribution over the regions and
# the synonymous / non-synonymous split of the CDS SNPs.
total_message  = "The total number of snps is {}\n"
region_message = "The distribution of the snps to the regions \nUTR5, CDS and UTR3 are:\n{}, {}, {}, respectively.\n"
cds_message    = "For the CDS snps,\n{} of them are synonymous and\n{} of them are non-synonymous.\n"

print(total_message.format(len(snp_df)))

print(region_message.format(len(utr5_snps), len(cds_snps), len(utr3_snps)))

print(cds_message.format(synonymous_snps, non_syn_snps))

The total number of snps is 528

The distribution of the snps to the regions 
UTR5, CDS and UTR3 are:
17, 187, 324, respectively.

For the CDS snps,
134 of them are synonymous and
53 of them are non-synonymous.



In [26]:
## Let's write the results
## No index=False is passed, so the dataframe index is written as well.

snp_df.to_csv("./snp_dataframes/selected_snps.csv.gz")


In [27]:

# For every cluster, write the sequences around its SNPs in FASTA format:
# one file with all SNPs of the cluster, plus one file per region.
cluster_list = (cluster_1_genes, cluster_2_genes, cluster_3_genes, cluster_4_genes)

for i, this_cluster in enumerate(cluster_list, start=1):

    with gzip.open("./snp_dataframes/cluster_{}_sequences.fa.gz".format(i), "wt") as output_stream,\
         gzip.open("./snp_dataframes/cluster_{}_cds_sequences.fa.gz".format(i), "wt") as cds_stream,\
         gzip.open("./snp_dataframes/cluster_{}_utr3_sequences.fa.gz".format(i), "wt") as utr3_stream,\
         gzip.open("./snp_dataframes/cluster_{}_utr5_sequences.fa.gz".format(i), "wt") as utr5_stream:

        # Region label -> region-specific output file.
        region_streams = {"CDS": cds_stream, "UTR3": utr3_stream, "UTR5": utr5_stream}

        for _, snp in snp_df.iterrows():
            if snp["gene"] not in this_cluster:
                continue

            # Header is <gene>_<0-based position>, followed by the flanking sequence.
            fasta_record = ">{}_{}\n{}".format(snp["gene"], snp["position"], snp["sequence"])
            print(fasta_record, file=output_stream)

            region_stream = region_streams.get(snp["region"])
            if region_stream is not None:
                print(fasta_record, file=region_stream)

In [28]:
# Display the SNP table again, now with the codon and amino acid columns.
snp_df

Unnamed: 0,transcript,gene,position,maternal,paternal,chromosome,chr_pos,strand,region,sequence,maternal_codon,paternal_codon,maternal_AA,paternal_AA
0,ENSMUST00000045866,Ddx21,4410,A,G,chr10,62580562,-,UTR3,TTAANTGAA,,,,
1,ENSMUST00000045866,Ddx21,4399,A,G,chr10,62580573,-,UTR3,ATTGNCACA,,,,
2,ENSMUST00000045866,Ddx21,4063,C,T,chr10,62580909,-,UTR3,CTCCNGGTA,,,,
3,ENSMUST00000045866,Ddx21,3913,G,A,chr10,62581059,-,UTR3,TTGGNAGGA,,,,
4,ENSMUST00000045866,Ddx21,3895,C,A,chr10,62581077,-,UTR3,AGGGNTCTT,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523,ENSMUST00000111820,Tmppe,2175,A,G,chr9,114406531,+,UTR3,GATTNCTGT,,,,
524,ENSMUST00000111820,Tmppe,2591,T,G,chr9,114406947,+,UTR3,TCACNCATA,,,,
525,ENSMUST00000111820,Tmppe,2708,C,T,chr9,114407064,+,UTR3,TGGGNCTGA,,,,
526,ENSMUST00000111820,Tmppe,2721,A,T,chr9,114407077,+,UTR3,TTAGNGAGA,,,,


In [35]:
# Codon-level summary of the SNPs that fall inside the CDS.
codon_table_columns = [
    "gene", "position",
    "maternal", "paternal",
    "chromosome", "chr_pos",
    "maternal_codon", "paternal_codon",
    "maternal_AA", "paternal_AA",
]
cds_only_snps = snp_df[snp_df["region"] == "CDS"]
codon_table_of_selected_genes = cds_only_snps[codon_table_columns]
codon_table_of_selected_genes

Unnamed: 0,gene,position,maternal,paternal,chromosome,chr_pos,maternal_codon,paternal_codon,maternal_AA,paternal_AA
23,Ddx21,2400,C,T,chr10,62582572,ACC,ACT,Thr,Thr
24,Ddx21,2397,C,T,chr10,62582575,CTC,CTT,Leu,Leu
25,Ddx21,1557,G,A,chr10,62589889,GCG,GCA,Ala,Ala
26,Ddx21,698,G,A,chr10,62598347,AGC,AAC,Ser,Asn
88,Cdk1,905,C,T,chr10,69340499,AAC,AAT,Asn,Asn
...,...,...,...,...,...,...,...,...,...,...
508,Folr1,528,A,G,chr7,101863970,AAA,AAG,Lys,Lys
514,Tmppe,779,G,A,chr9,114405135,GCG,GCA,Ala,Ala
515,Tmppe,1013,C,A,chr9,114405369,GGC,GGA,Gly,Gly
516,Tmppe,1121,T,C,chr9,114405477,TAT,TAC,Tyr,Tyr


In [39]:
# Save the CDS codon table as CSV, without the dataframe index column.
codon_table_of_selected_genes.to_csv("./snp_dataframes/codon_table_of_selected_genes.csv", index = False)