# App 2.1

February 10th, 2022

In [1]:
import pandas as pd
from helpers.App_2_1_utils import (
    read_genes, construct_table
)

# Input table of mouse UTR sequences; read below as tab-separated text via pd.read_csv.
UTR_SEQUENCE_MOUSE_DATA_PATH = "../data/UTR_Sequences/UTR_sequences_mouse.txt"
# Gene list file handed to read_genes below.
# NOTE(review): "MARCOPHAGE" looks like a typo for "MACROPHAGE"; renaming it means
# updating every cell that references this constant.
MARCOPHAGE_RIBOSEQ_HKAZAN_GENES_PATH = "../data/RPF_genes/macrophage_riboseq_hkazan_genes_2022-02-10.txt"

# Reflect changes in the modules immediately.
# @formatter:off
%load_ext autoreload
%autoreload 2
# @formatter:on

In [2]:
# Sequence motifs handed to construct_table; the resulting table carries a count
# column and a "<motif>_normalized" column for each entry.
MOTIF_LIST = ["AUUUA", "UAUUUAU"]

In [3]:
# Load the tab-separated mouse UTR sequence table, then display it.
UTR_seq_mouse_data = pd.read_table(UTR_SEQUENCE_MOUSE_DATA_PATH)
UTR_seq_mouse_data

Unnamed: 0,Refseq ID,Gene ID,Gene Symbol,Species ID,UTR sequence
0,CDR1as,CDR1as,CDR1as,10090,UAUCUA-GGGUUUCCAGUGGUGCCAGUACCAAGGUCUUCCAACAUC...
1,ENSMUST00000000001.4,ENSMUSG00000000001.4,Gnai3,10090,GAGGAUG-GC---AU----------AG-U-----------A-AAAG...
2,ENSMUST00000000003.8,ENSMUSG00000000003.10,Pbsn,10090,UGACUCAACAAGAUCAGGAUUAGCAUUACAGAUGACAUCAGGAAUU...
3,ENSMUST00000000010.8,ENSMUSG00000020875.9,Hoxb9,10090,AGA--UGACC------------------------------------...
4,ENSMUST00000000028.8,ENSMUSG00000000028.9,Cdc45,10090,GGAAUUC--AACUU-CU-CCAGAAGUGA------CCUCCUUUUCCU...
...,...,...,...,...,...
24893,ENSMUST00000191553.1,ENSMUSG00000095606.2,Gm21258,10090,GAUAUGCUUUUU---GGUGUAGAUGGUGAACUGAGAAAAAAAAAUG...
24894,ENSMUST00000191565.1,ENSMUSG00000047528.8,Als2cr12,10090,GUA-ACUAACUGCUC-CUGGG-GCUG----AGC----GACACUGCC...
24895,ENSMUST00000191577.1,ENSMUSG00000042451.7,Mybph,10090,AGA-GCAACAGA------------GG------C--AGAGAC---UG...
24896,ENSMUST00000191598.1,ENSMUSG00000024038.11,Ndufv3,10090,ACUUC---UGAUGCUUUGCUCAGGAAGGGCAAAC-CC---------...


In [4]:
# Load the gene IDs from the gene list file.
# NOTE(review): unique=True presumably drops duplicate IDs — confirm in
# helpers.App_2_1_utils.read_genes.
macrophage_riboseq_hkazan_genes = read_genes(MARCOPHAGE_RIBOSEQ_HKAZAN_GENES_PATH, unique=True)

print(f"Number of genes: {len(macrophage_riboseq_hkazan_genes)}")

# Peek at the first five entries.
macrophage_riboseq_hkazan_genes[:5]

Number of genes: 22301


['ENSMUSG00000000001',
 'ENSMUSG00000000003',
 'ENSMUSG00000000028',
 'ENSMUSG00000000037',
 'ENSMUSG00000000049']

In [5]:
# Build the per-gene motif table. construct_table returns two values: the table and
# a list of gene IDs; per the recorded output, these are the input genes whose UTR
# sequence was not found.
table, error_genes = construct_table(
    input_genes=macrophage_riboseq_hkazan_genes,
    UTR_seq_data=UTR_seq_mouse_data,
    motifs=MOTIF_LIST
)

  0%|          | 0/22301 [00:00<?, ?it/s]

Number of genes whose UTR sequence not found: 2031


In [6]:
table

Unnamed: 0,GENE_SYMBOL,GENE_ENSMUSG,SEQUENCE,SEQUENCE_LENGTH,AUUUA,AUUUA_normalized,UAUUUAU,UAUUUAU_normalized
0,Gnai3,ENSMUSG00000000001,GAGGAUGGCAUAGUAAAAGCUAUUACAGGGAGGAGUGUUGAGACCA...,3332,5,0.001501,0,0.000000
1,Pbsn,ENSMUSG00000000003,UGACUCAACAAGAUCAGGAUUAGCAUUACAGAUGACAUCAGGAAUU...,240,2,0.008333,1,0.004167
2,Cdc45,ENSMUSG00000000028,GGAAUUCAACUUCUCCAGAAGUGACCUCCUUUUCCUUAUUUAUAUU...,133,1,0.007519,1,0.007519
3,Scml2,ENSMUSG00000000037,UGGGAAAGCUCUGCUACUACUGAAGAGUGAGCUGCUGAUGAAGUAU...,1640,3,0.001829,2,0.001220
4,Apoh,ENSMUSG00000000049,AGUCGUUUCCAGAAUCAAAAUUGAAUGUCGUAUUUGUAUUUCUGCU...,115,0,0.000000,0,0.000000
...,...,...,...,...,...,...,...,...
20265,Bcl2a1a,ENSMUSG00000102037,CACUACGGACCAGAGAGAACCUUGGCUCCCACGAGAAUGGUUAUCA...,149,0,0.000000,0,0.000000
20266,CH36-205C2.2,ENSMUSG00000102045,GAUAUGCUUUUUGGUGUAGAUGGUGAACUGAGAAAAAAAAAUGUCU...,192,0,0.000000,0,0.000000
20267,Gm4064,ENSMUSG00000102053,AGUGGAGGCAGGUACUAAAGUCUUAAGUUGUGUACCAAACAGCAUU...,361,2,0.005540,0,0.000000
20268,Olfr1034,ENSMUSG00000102091,AAUCAAAACAUAUUUUAGAAGCGUGUUUAUCCAAAUGUGUUGAACA...,1342,5,0.003726,0,0.000000


In [7]:
error_genes[:3]

['ENSMUSG00000006638', 'ENSMUSG00000007457', 'ENSMUSG00000009047']

In [8]:
from datetime import datetime

# Date stamp used in the output file names written by this cell and the next one.
file_date = datetime.now().strftime("%Y-%m-%d")

# Use ExcelWriter as a context manager so the workbook is saved and closed even if
# to_excel raises. ExcelWriter.save() is deprecated and was removed in pandas 2.0.
with pd.ExcelWriter(f'../out/App_2_1/macrophage_riboseq_hkazan_analysis_{file_date}.xlsx') as writer:
    table.to_excel(writer, index=False)

In [9]:
# Write every entry of error_genes to a text file, one ID per line.
with open(f"../out/App_2_1/UTR_missing_genes{file_date}.txt", "w") as missing_genes_file:
    missing_genes_file.writelines(f"{gene_id}\n" for gene_id in error_genes)


In [5]:
# NOTE(review): this repeats the construct_table call from earlier in the notebook with
# the same arguments and the same execution count. Its recorded output differs (one
# "Gene ... is not found." line per gene), so it looks like a leftover from an earlier
# run — confirm and remove if so.
table, error_genes = construct_table(
    input_genes=macrophage_riboseq_hkazan_genes,
    UTR_seq_data=UTR_seq_mouse_data,
    motifs=MOTIF_LIST
)

  0%|          | 0/22301 [00:00<?, ?it/s]

Gene ENSMUSG00000006638 is not found.
Gene ENSMUSG00000007457 is not found.
Gene ENSMUSG00000009047 is not found.
Gene ENSMUSG00000018451 is not found.
Gene ENSMUSG00000019933 is not found.
Gene ENSMUSG00000020168 is not found.
Gene ENSMUSG00000021879 is not found.
Gene ENSMUSG00000023259 is not found.
Gene ENSMUSG00000023984 is not found.
Gene ENSMUSG00000028407 is not found.
Gene ENSMUSG00000028475 is not found.
Gene ENSMUSG00000029184 is not found.
Gene ENSMUSG00000029541 is not found.
Gene ENSMUSG00000029720 is not found.
Gene ENSMUSG00000030068 is not found.
Gene ENSMUSG00000030194 is not found.
Gene ENSMUSG00000030604 is not found.
Gene ENSMUSG00000030804 is not found.
Gene ENSMUSG00000032987 is not found.
Gene ENSMUSG00000033036 is not found.
Gene ENSMUSG00000033343 is not found.
Gene ENSMUSG00000034185 is not found.
Gene ENSMUSG00000034677 is not found.
Gene ENSMUSG00000035262 is not found.
Gene ENSMUSG00000036322 is not found.
Gene ENSMUSG00000036500 is not found.
Gene ENSMUSG

In [7]:
table

Unnamed: 0,GENE_ENSMUSG,SEQUENCE,SEQUENCE_LENGTH,AUUUA,AUUUA_normalized,UAUUUAU,UAUUUAU_normalized
0,ENSMUSG00000000001,GAGGAUGGCAUAGUAAAAGCUAUUACAGGGAGGAGUGUUGAGACCA...,3332,5,0.001501,0,0.000000
1,ENSMUSG00000000003,UGACUCAACAAGAUCAGGAUUAGCAUUACAGAUGACAUCAGGAAUU...,240,2,0.008333,1,0.004167
2,ENSMUSG00000000028,GGAAUUCAACUUCUCCAGAAGUGACCUCCUUUUCCUUAUUUAUAUU...,133,1,0.007519,1,0.007519
3,ENSMUSG00000000037,UGGGAAAGCUCUGCUACUACUGAAGAGUGAGCUGCUGAUGAAGUAU...,1640,3,0.001829,2,0.001220
4,ENSMUSG00000000049,AGUCGUUUCCAGAAUCAAAAUUGAAUGUCGUAUUUGUAUUUCUGCU...,115,0,0.000000,0,0.000000
...,...,...,...,...,...,...,...
20265,ENSMUSG00000102037,CACUACGGACCAGAGAGAACCUUGGCUCCCACGAGAAUGGUUAUCA...,149,0,0.000000,0,0.000000
20266,ENSMUSG00000102045,GAUAUGCUUUUUGGUGUAGAUGGUGAACUGAGAAAAAAAAAUGUCU...,192,0,0.000000,0,0.000000
20267,ENSMUSG00000102053,AGUGGAGGCAGGUACUAAAGUCUUAAGUUGUGUACCAAACAGCAUU...,361,2,0.005540,0,0.000000
20268,ENSMUSG00000102091,AAUCAAAACAUAUUUUAGAAGCGUGUUUAUCCAAAUGUGUUGAACA...,1342,5,0.003726,0,0.000000


Unnamed: 0,Refseq ID,Gene ID,Gene Symbol,Species ID,UTR sequence
0,CDR1as,CDR1as,CDR1as,10090,UAUCUA-GGGUUUCCAGUGGUGCCAGUACCAAGGUCUUCCAACAUC...
1,ENSMUST00000000001.4,ENSMUSG00000000001.4,Gnai3,10090,GAGGAUG-GC---AU----------AG-U-----------A-AAAG...
2,ENSMUST00000000003.8,ENSMUSG00000000003.10,Pbsn,10090,UGACUCAACAAGAUCAGGAUUAGCAUUACAGAUGACAUCAGGAAUU...
3,ENSMUST00000000010.8,ENSMUSG00000020875.9,Hoxb9,10090,AGA--UGACC------------------------------------...
4,ENSMUST00000000028.8,ENSMUSG00000000028.9,Cdc45,10090,GGAAUUC--AACUU-CU-CCAGAAGUGA------CCUCCUUUUCCU...
...,...,...,...,...,...
24893,ENSMUST00000191553.1,ENSMUSG00000095606.2,Gm21258,10090,GAUAUGCUUUUU---GGUGUAGAUGGUGAACUGAGAAAAAAAAAUG...
24894,ENSMUST00000191565.1,ENSMUSG00000047528.8,Als2cr12,10090,GUA-ACUAACUGCUC-CUGGG-GCUG----AGC----GACACUGCC...
24895,ENSMUST00000191577.1,ENSMUSG00000042451.7,Mybph,10090,AGA-GCAACAGA------------GG------C--AGAGAC---UG...
24896,ENSMUST00000191598.1,ENSMUSG00000024038.11,Ndufv3,10090,ACUUC---UGAUGCUUUGCUCAGGAAGGGCAAAC-CC---------...


In [39]:
# NOTE(review): same value as the definition in the first cell; this reassignment
# looks redundant — confirm before removing.
MARCOPHAGE_RIBOSEQ_HKAZAN_GENES_PATH = "../data/RPF_genes/macrophage_riboseq_hkazan_genes_2022-02-10.txt"

In [45]:
# NOTE(review): this import rebinds read_genes to helpers.utils.read_genes, replacing
# the helpers.App_2_1_utils version imported in the first cell for any cell run later.
from helpers.utils import read_genes

# Called without unique=True here; the recorded count (22654) and the first IDs differ
# from the earlier read_genes cell (22301).
macrophage_riboseq_hkazan_genes = read_genes(MARCOPHAGE_RIBOSEQ_HKAZAN_GENES_PATH)

print(f"Number of genes: {len(macrophage_riboseq_hkazan_genes)}")

macrophage_riboseq_hkazan_genes[:5]

Number of genes: 22654


['ENSMUSG00000051951',
 'ENSMUSG00000025900',
 'ENSMUSG00000025902',
 'ENSMUSG00000002459',
 'ENSMUSG00000025905']

In [None]:
# NOTE(review): not referenced by any other cell visible in this notebook, and this
# cell has no execution count — confirm it is still needed.
file_name = "macrophage_riboseq_hkazan_genes"


In [44]:
# NOTE(review): the recorded output of this cell is KeyError: 'ENSMUSG00000051951'.
# predicted_target_info_mouse_data, ENSMUSG_to_gene_symbol and
# miR_223_representative_transcripts are not defined in any visible cell, and this call
# passes keyword arguments that the construct_table call near the top of the notebook
# does not. It appears to target a different construct_table signature — confirm
# before re-running.
table_macrophage_riboseq_hkazan_genes = construct_table(
    input_genes=macrophage_riboseq_hkazan_genes,
    pred_target_info_data=predicted_target_info_mouse_data,
    UTR_seq_data=UTR_seq_mouse_data,
    gene_mapping=ENSMUSG_to_gene_symbol,
    representative_transcripts=miR_223_representative_transcripts,
    motifs=MOTIF_LIST
)

KeyError: 'ENSMUSG00000051951'

In [51]:
# Build one table per input gene list, keyed by file name. Empty gene lists are
# reported and skipped; each iteration ends with a separator line.
filenames_to_dataframes = {}
for filename, input_genes in match_file_to_genes.items():
    print(f"PROCESSING {filename} with {len(input_genes)} genes.")
    if len(input_genes) > 0:
        table = construct_table(
            input_genes=input_genes,
            pred_target_info_data=predicted_target_info_mouse_data,
            UTR_seq_data=UTR_seq_mouse_data,
            gene_mapping=ENSMUSG_to_gene_symbol,
            representative_transcripts=miR_223_representative_transcripts,
            motifs=MOTIF_LIST
        )
        filenames_to_dataframes[filename] = table
    else:
        print(" [warning] No gene found, skipping ..")
    print("=======================================================")

PROCESSING macrophage_riboseq_hkazan with 319 genes.


In [52]:
filenames_to_dataframes.keys()

dict_keys(['macrophage_riboseq_hkazan'])

In [54]:
filenames_to_dataframes["macrophage_riboseq_hkazan"]

Unnamed: 0,GENE_NAME,GENE_ENSMUSG,#_SITE,SITE_POSITIONS,SEQUENCE,SEQUENCE_LENGTH,AUUUA,AUUUA_normalized,UAUUUAU,UAUUUAU_normalized
0,Cbx5,ENSMUSG00000009575,1,[7984:7991],AGGAGGAGGCAGUCUCUGUCGUUUCUCUUUGUAUAUAAUACCUUUA...,8097,7,0.000865,1,0.000124
1,Rc3h1,ENSMUSG00000040423,1,[6995:7001],AAUAUGGGCACCUGCUUCUACCUUCUGCUCCUAAUCAGUUCAUGGG...,7181,20,0.002785,2,0.000279
2,Armc1,ENSMUSG00000027599,1,[869:876],CUUCAACUUCAGUUUGGGGCUCAAGGACUGUGCAAACCAACAGGGG...,8883,18,0.002026,4,0.000450
3,Fbxo25,ENSMUSG00000038365,2,"[1231:1238, 324:331]",UGGUGCACUCAGCCCCGUGCCCUUGUCUACAGCUGUCCCUUGUGCU...,6636,7,0.001055,1,0.000151
4,Smlr1,ENSMUSG00000096546,1,[1158:1165],AGGUGGGGGUUGGUGUCACCAAUGGGCAAAGCACCAUGAAUUUCUC...,2164,1,0.000462,0,0.000000
...,...,...,...,...,...,...,...,...,...,...
314,Plce1,ENSMUSG00000024998,1,[998:1004],GCAAGUGCGGCAUAUUUGUUCCAGGGAUAUGAAGGGGGAGGUCCUU...,3553,7,0.001970,2,0.000563
315,Scn1a,ENSMUSG00000064329,1,[90:96],ACAAAAACAACAAAAAAAAAUAAUAAAUUGGGUGACAAAUUGUUUA...,2104,6,0.002852,3,0.001426
316,Kcnq1,ENSMUSG00000009545,1,[172:178],CACAACUGGACCAGAGACUGGUGAUCAUCACAGACAUGCUCCACCA...,4914,3,0.000611,0,0.000000
317,Phlpp1,ENSMUSG00000044340,1,[620:626],CCCAGCCUGAGUACUGUUUUAAACAAUAAACUAACCAGAGAGACUG...,3650,6,0.001644,0,0.000000


In [56]:
from datetime import datetime

# Date stamp used in the output file name.
file_date = datetime.now().strftime("%Y-%m-%d")

# Write one worksheet per entry of filenames_to_dataframes, named after its key.
# The context manager saves and closes the workbook exactly once, even if to_excel
# raises. The previous explicit save() followed by close() closed the file twice
# ("Calling close() on already closed file." warning), and ExcelWriter.save() was
# removed in pandas 2.0. sheet_name is passed by keyword because positional use is
# deprecated in recent pandas.
with pd.ExcelWriter(f'../out/macrophage_riboseq_hkazan_analysis_{file_date}.xlsx') as writer:
    for filename, dataframe in filenames_to_dataframes.items():
        dataframe.to_excel(writer, sheet_name=filename, index=False)

  warn("Calling close() on already closed file.")


---