# Mouse miR-223 Program

In [1]:
# Reflect changes in the modules immediately.

# @formatter:off
%load_ext autoreload
%autoreload 2
# @formatter:on

In [2]:
import os
import pandas as pd
from helpers.utils import construct_table
import glob

UTR_SEQUENCE_MOUSE_DATA_PATH = "../data/UTR_Sequences/UTR_sequences_mouse.txt"
PREDICTED_TARGET_INFO_MOUSE_DATA_PATH = "../data/predicted_targets_info/Predicted_Targets_Info_Mouse_miR-223.txt"
PREDICTED_TARGET_REPRESENTATIVE_DATA_PATH = "../data/predicted_targets/TargetScanMouse_8.0__miR-223-3p.predicted_targets.txt"

In [3]:
predicted_target_representative_mouse_data = pd.read_csv(PREDICTED_TARGET_REPRESENTATIVE_DATA_PATH, sep="\t")
miR_223_representative_transcripts = predicted_target_representative_mouse_data["Representative transcript"].tolist()
miR_223_representative_transcripts = [transcript.split(".")[0] for transcript in miR_223_representative_transcripts]
miR_223_representative_transcripts[:3]

['ENSMUST00000029535', 'ENSMUST00000067384', 'ENSMUST00000067077']

In [4]:
MOTIF_LIST = [
    "AUUUA",
    "UAUUUAU",
]

In [5]:
UTR_seq_mouse_data = pd.read_csv(UTR_SEQUENCE_MOUSE_DATA_PATH, sep="\t")
UTR_seq_mouse_data

Unnamed: 0,Refseq ID,Gene ID,Gene Symbol,Species ID,UTR sequence
0,CDR1as,CDR1as,CDR1as,10090,UAUCUA-GGGUUUCCAGUGGUGCCAGUACCAAGGUCUUCCAACAUC...
1,ENSMUST00000000001.4,ENSMUSG00000000001.4,Gnai3,10090,GAGGAUG-GC---AU----------AG-U-----------A-AAAG...
2,ENSMUST00000000003.8,ENSMUSG00000000003.10,Pbsn,10090,UGACUCAACAAGAUCAGGAUUAGCAUUACAGAUGACAUCAGGAAUU...
3,ENSMUST00000000010.8,ENSMUSG00000020875.9,Hoxb9,10090,AGA--UGACC------------------------------------...
4,ENSMUST00000000028.8,ENSMUSG00000000028.9,Cdc45,10090,GGAAUUC--AACUU-CU-CCAGAAGUGA------CCUCCUUUUCCU...
...,...,...,...,...,...
24893,ENSMUST00000191553.1,ENSMUSG00000095606.2,Gm21258,10090,GAUAUGCUUUUU---GGUGUAGAUGGUGAACUGAGAAAAAAAAAUG...
24894,ENSMUST00000191565.1,ENSMUSG00000047528.8,Als2cr12,10090,GUA-ACUAACUGCUC-CUGGG-GCUG----AGC----GACACUGCC...
24895,ENSMUST00000191577.1,ENSMUSG00000042451.7,Mybph,10090,AGA-GCAACAGA------------GG------C--AGAGAC---UG...
24896,ENSMUST00000191598.1,ENSMUSG00000024038.11,Ndufv3,10090,ACUUC---UGAUGCUUUGCUCAGGAAGGGCAAAC-CC---------...


In [6]:
predicted_target_info_mouse_data = pd.read_csv(PREDICTED_TARGET_INFO_MOUSE_DATA_PATH, sep="\t")
predicted_target_info_mouse_data

Unnamed: 0,miR Family,Gene ID,Gene Symbol,Transcript ID,Species ID,UTR start,UTR end,MSA start,MSA end,Seed match,PCT
0,miR-223-3p,ENSMUSG00000035401.8,2210018M11Rik,ENSMUST00000038359.4,10090,1181,1187,2140,2151,7mer-1a,0.45
1,miR-223-3p,ENSMUSG00000026319.8,2310035C23Rik,ENSMUST00000086721.4,10090,484,490,715,723,7mer-m8,0.32
2,miR-223-3p,ENSMUSG00000073741.1,4732440D04Rik,ENSMUST00000097832.1,10090,1239,1246,2372,2379,8mer,0.09
3,miR-223-3p,ENSMUSG00000034959.7,5031414D18Rik,ENSMUST00000036072.7,10090,335,342,822,829,8mer,0.48
4,miR-223-3p,ENSMUSG00000070690.4,5830473C10Rik,ENSMUST00000094615.4,10090,2749,2756,3767,3774,8mer,0.00
...,...,...,...,...,...,...,...,...,...,...,...
339,miR-223-3p,ENSMUSG00000061410.6,Zcchc14,ENSMUST00000046386.4,10090,1680,1687,2860,2871,8mer,0.49
340,miR-223-3p,ENSMUSG00000024238.8,Zeb1,ENSMUST00000025081.6,10090,1245,1251,1936,1944,7mer-1a,0.44
341,miR-223-3p,ENSMUSG00000024750.5,Zfand5,ENSMUST00000025659.5,10090,17515,17521,21933,21939,7mer-1a,0.42
342,miR-223-3p,ENSMUSG00000038872.9,Zfhx3,ENSMUST00000043896.9,10090,884,890,1587,1593,7mer-m8,0.46


# App

In [7]:
mapping_data = predicted_target_info_mouse_data.copy()
mapping_data["Gene ID"] = mapping_data["Gene ID"].apply(lambda x: x.split(".")[0])
ENSMUSG_to_gene_symbol = mapping_data.set_index("Gene ID").to_dict()["Gene Symbol"]
ENSMUSG_to_gene_symbol

{'ENSMUSG00000035401': '2210018M11Rik',
 'ENSMUSG00000026319': '2310035C23Rik',
 'ENSMUSG00000073741': '4732440D04Rik',
 'ENSMUSG00000034959': '5031414D18Rik',
 'ENSMUSG00000070690': '5830473C10Rik',
 'ENSMUSG00000040396': 'Abhd13',
 'ENSMUSG00000028405': 'Aco1',
 'ENSMUSG00000032883': 'Acsl3',
 'ENSMUSG00000025228': 'Actr1a',
 'ENSMUSG00000052155': 'Acvr2a',
 'ENSMUSG00000053399': 'Adamts18',
 'ENSMUSG00000031659': 'Adcy7',
 'ENSMUSG00000030232': 'Aebp2',
 'ENSMUSG00000050103': 'Agmo',
 'ENSMUSG00000028842': 'Ago3',
 'ENSMUSG00000022636': 'Alcam',
 'ENSMUSG00000055204': 'Ankrd17',
 'ENSMUSG00000020864': 'Ankrd40',
 'ENSMUSG00000058589': 'Anks1b',
 'ENSMUSG00000025525': 'Apool',
 'ENSMUSG00000074513': 'Arfip1',
 'ENSMUSG00000035133': 'Arhgap5',
 'ENSMUSG00000007880': 'Arid1a',
 'ENSMUSG00000027599': 'Armc1',
 'ENSMUSG00000033460': 'Armcx1',
 'ENSMUSG00000007656': 'Arpp19',
 'ENSMUSG00000000325': 'Arvcf',
 'ENSMUSG00000026576': 'Atp1b1',
 'ENSMUSG00000019943': 'Atp2b1',
 'ENSMUSG0000005

In [8]:
ENSMUSG_to_gene_symbol["ENSMUSG00000039652"]

'Cpeb3'

In [9]:
predicted_target_info_mouse_data

Unnamed: 0,miR Family,Gene ID,Gene Symbol,Transcript ID,Species ID,UTR start,UTR end,MSA start,MSA end,Seed match,PCT
0,miR-223-3p,ENSMUSG00000035401.8,2210018M11Rik,ENSMUST00000038359.4,10090,1181,1187,2140,2151,7mer-1a,0.45
1,miR-223-3p,ENSMUSG00000026319.8,2310035C23Rik,ENSMUST00000086721.4,10090,484,490,715,723,7mer-m8,0.32
2,miR-223-3p,ENSMUSG00000073741.1,4732440D04Rik,ENSMUST00000097832.1,10090,1239,1246,2372,2379,8mer,0.09
3,miR-223-3p,ENSMUSG00000034959.7,5031414D18Rik,ENSMUST00000036072.7,10090,335,342,822,829,8mer,0.48
4,miR-223-3p,ENSMUSG00000070690.4,5830473C10Rik,ENSMUST00000094615.4,10090,2749,2756,3767,3774,8mer,0.00
...,...,...,...,...,...,...,...,...,...,...,...
339,miR-223-3p,ENSMUSG00000061410.6,Zcchc14,ENSMUST00000046386.4,10090,1680,1687,2860,2871,8mer,0.49
340,miR-223-3p,ENSMUSG00000024238.8,Zeb1,ENSMUST00000025081.6,10090,1245,1251,1936,1944,7mer-1a,0.44
341,miR-223-3p,ENSMUSG00000024750.5,Zfand5,ENSMUST00000025659.5,10090,17515,17521,21933,21939,7mer-1a,0.42
342,miR-223-3p,ENSMUSG00000038872.9,Zfhx3,ENSMUST00000043896.9,10090,884,890,1587,1593,7mer-m8,0.46


In [10]:
import ast

In [15]:
MATCHED_DATA_PATH = "../TargetScan_summary_match_data_2022-01-13.csv"

In [16]:
summary_match_data = pd.read_csv(MATCHED_DATA_PATH)
summary_match_data['MATCHED_GENES_ENSMUSG'] = summary_match_data['MATCHED_GENES_ENSMUSG'].apply(lambda x: ast.literal_eval(x))
summary_match_data

Unnamed: 0,GENE_LIST_FILENAME,NUM_MATCHED,MATCHED_GENES_ENSMUSG
0,rna_m0_down.txt,0,{}
1,rna_m0_up.txt,1,{ENSMUSG00000074622}
2,rna_m1_down.txt,3,"{ENSMUSG00000032479, ENSMUSG00000025484, ENSMU..."
3,rna_m1_up.txt,0,{}
4,rpf_m0_down.txt,27,"{ENSMUSG00000050103, ENSMUSG00000021952, ENSMU..."
5,rpf_m0_up.txt,56,"{ENSMUSG00000051615, ENSMUSG00000005161, ENSMU..."
6,rpf_m1_down.txt,22,"{ENSMUSG00000036282, ENSMUSG00000050103, ENSMU..."
7,rpf_m1_up.txt,18,"{ENSMUSG00000022773, ENSMUSG00000018398, ENSMU..."


In [17]:
match_file_to_genes = summary_match_data.set_index("GENE_LIST_FILENAME").to_dict()["MATCHED_GENES_ENSMUSG"]
match_file_to_genes

{'rna_m0_down.txt': set(),
 'rna_m0_up.txt': {'ENSMUSG00000074622'},
 'rna_m1_down.txt': {'ENSMUSG00000002068',
  'ENSMUSG00000025484',
  'ENSMUSG00000032479'},
 'rna_m1_up.txt': set(),
 'rpf_m0_down.txt': {'ENSMUSG00000002107',
  'ENSMUSG00000005501',
  'ENSMUSG00000005882',
  'ENSMUSG00000018363',
  'ENSMUSG00000021952',
  'ENSMUSG00000022802',
  'ENSMUSG00000024074',
  'ENSMUSG00000026594',
  'ENSMUSG00000026718',
  'ENSMUSG00000026754',
  'ENSMUSG00000026833',
  'ENSMUSG00000027651',
  'ENSMUSG00000028621',
  'ENSMUSG00000029840',
  'ENSMUSG00000029924',
  'ENSMUSG00000032185',
  'ENSMUSG00000035021',
  'ENSMUSG00000035401',
  'ENSMUSG00000038957',
  'ENSMUSG00000043639',
  'ENSMUSG00000050103',
  'ENSMUSG00000054894',
  'ENSMUSG00000055723',
  'ENSMUSG00000058254',
  'ENSMUSG00000063952',
  'ENSMUSG00000073557',
  'ENSMUSG00000074513'},
 'rpf_m0_up.txt': {'ENSMUSG00000000253',
  'ENSMUSG00000005161',
  'ENSMUSG00000006705',
  'ENSMUSG00000007656',
  'ENSMUSG00000014504',
  'ENSMUS

In [18]:
filenames_to_dataframes = {}
for filename, input_genes in match_file_to_genes.items():
    print(f"PROCESSING {filename} with {len(input_genes)} genes.")
    if len(input_genes) == 0:
        print(" [warning] No gene found, skipping ..")
        print("=======================================================")
        continue

    table = construct_table(
        input_genes=input_genes,
        pred_target_info_data=predicted_target_info_mouse_data,
        UTR_seq_data=UTR_seq_mouse_data,
        gene_mapping=ENSMUSG_to_gene_symbol,
        representative_transcripts=miR_223_representative_transcripts,
        motifs=MOTIF_LIST
    )
    filenames_to_dataframes[filename] = table
    print("=======================================================")

PROCESSING rna_m0_down.txt with 0 genes.
PROCESSING rna_m0_up.txt with 1 genes.
PROCESSING rna_m1_down.txt with 3 genes.
PROCESSING rna_m1_up.txt with 0 genes.
PROCESSING rpf_m0_down.txt with 27 genes.
PROCESSING rpf_m0_up.txt with 56 genes.
PROCESSING rpf_m1_down.txt with 22 genes.
PROCESSING rpf_m1_up.txt with 18 genes.


In [19]:
filenames_to_dataframes.keys()

dict_keys(['rna_m0_up.txt', 'rna_m1_down.txt', 'rpf_m0_down.txt', 'rpf_m0_up.txt', 'rpf_m1_down.txt', 'rpf_m1_up.txt'])

In [25]:
writer = pd.ExcelWriter('TargetScan_matched_genes_analysis_updated.xlsx')

for filename in filenames_to_dataframes.keys():
    filenames_to_dataframes[filename].to_excel(writer, filename, index=False)

writer.save()
writer.close()

  warn("Calling close() on already closed file.")


---

In [None]:
data.to_excel(writer,'result')
writer.save()

In [55]:
filenames_to_dataframes["m0_up.txt"].to_clipboard(index=False)

In [56]:
filenames_to_dataframes["m1_down.txt"].to_clipboard(index=False)

In [57]:
filenames_to_dataframes["pf_m0_down.txt"].to_clipboard(index=False)

In [58]:
filenames_to_dataframes["pf_m0_up.txt"].to_clipboard(index=False)

In [59]:
filenames_to_dataframes["pf_m1_down.txt"].to_clipboard(index=False)

In [60]:
filenames_to_dataframes["pf_m1_up.txt"].to_clipboard(index=False)

In [11]:
construct_table(
    input_genes=["ENSMUSG00000027927", "ENSMUSG00000053897"],
    pred_target_info_data=predicted_target_info_mouse_data,
    UTR_seq_data=UTR_seq_mouse_data,
    gene_mapping=ENSMUSG_to_gene_symbol,
    representative_transcripts=miR_223_representative_transcripts,
    motifs=MOTIF_LIST
)

Unnamed: 0,GENE_NAME,GENE_ENSMUSG,#_SITE,SITE_POSITIONS,AUUUA,UAUUUAU
0,Lelp1,ENSMUSG00000027927,2,"[83:90, 349:356]",0,0
1,Slc39a8,ENSMUSG00000053897,1,[1387:1393],14,1
