# Match the genes against the given gene lists

In [8]:
import os
import os.path as op
import glob

import pandas as pd

In [15]:
def read_genes(file_name):
    """Read a gene list from a plain-text file holding one gene per line.

    Parameters
    ----------
    file_name : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of str
        The lines of the file with surrounding whitespace removed, in file
        order. Duplicate entries are kept; blank lines are skipped.
    """
    with open(file_name) as file:
        # Iterate the handle lazily instead of materialising readlines().
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing empty line) would otherwise become ''
        # entries that inflate the gene count.
        genes = [gene for gene in stripped_lines if gene]

    return genes

In [12]:
# Collect every file in the sibling "gen_lists" directory.
# The pattern is built with os.path.join so it is not tied to the backslash
# separator, and the result is sorted because glob does not promise an order
# while later cells index this list by position (gene_list_files[-1]).
gene_list_files = sorted(glob.glob(op.join("..", "gen_lists", "*")))
gene_list_files

['..\\gen_lists\\rna_m0_down.txt',
 '..\\gen_lists\\rna_m0_up.txt',
 '..\\gen_lists\\rna_m1_down.txt',
 '..\\gen_lists\\rna_m1_up.txt',
 '..\\gen_lists\\rpf_m0_down.txt',
 '..\\gen_lists\\rpf_m0_up.txt',
 '..\\gen_lists\\rpf_m1_down.txt',
 '..\\gen_lists\\rpf_m1_up.txt']

In [17]:
# Load the genes: map each gene-list file path to the list of genes read from it.
gene_list_files_to_genes = {
    list_path: read_genes(list_path) for list_path in gene_list_files
}

In [23]:
# Example: print the path of the last gene-list file, then display its genes.
example_file = gene_list_files[-1]
print(example_file)
gene_list_files_to_genes[example_file]

..\gen_lists\rpf_m1_up.txt


['ENSMUSG00000033740',
 'ENSMUSG00000025917',
 'ENSMUSG00000056763',
 'ENSMUSG00000025920',
 'ENSMUSG00000025779',
 'ENSMUSG00000026110',
 'ENSMUSG00000026088',
 'ENSMUSG00000026087',
 'ENSMUSG00000026082',
 'ENSMUSG00000073702',
 'ENSMUSG00000010290',
 'ENSMUSG00000002881',
 'ENSMUSG00000026102',
 'ENSMUSG00000026095',
 'ENSMUSG00000026095',
 'ENSMUSG00000026095',
 'ENSMUSG00000026095',
 'ENSMUSG00000025980',
 'ENSMUSG00000026031',
 'ENSMUSG00000026024',
 'ENSMUSG00000026020',
 'ENSMUSG00000073664',
 'ENSMUSG00000025959',
 'ENSMUSG00000070871',
 'ENSMUSG00000039354',
 'ENSMUSG00000026171',
 'ENSMUSG00000026198',
 'ENSMUSG00000032997',
 'ENSMUSG00000089844',
 'ENSMUSG00000026222',
 'ENSMUSG00000049608',
 'ENSMUSG00000062590',
 'ENSMUSG00000044783',
 'ENSMUSG00000026260',
 'ENSMUSG00000026279',
 'ENSMUSG00000040648',
 'ENSMUSG00000062345',
 'ENSMUSG00000098923',
 'ENSMUSG00000026384',
 'ENSMUSG00000026388',
 'ENSMUSG00000026390',
 'ENSMUSG00000045382',
 'ENSMUSG00000026409',
 'ENSMUSG00

In [32]:
# Preview: the first three genes of the last list as one space-separated string.
first_three_genes = gene_list_files_to_genes[gene_list_files[-1]][:3]
" ".join(first_three_genes)

'ENSMUSG00000033740 ENSMUSG00000025917 ENSMUSG00000056763'

In [92]:
# Number of entries in the last gene list (a plain list length, so repeated IDs count each time).
last_list_genes = gene_list_files_to_genes[gene_list_files[-1]]
len(last_list_genes)

1148

In [75]:
# Read MiRNA 223 MirTarBase genes and upper-case every entry.
mirtarbase_entries = read_genes("target_genes_mouse_miRNA_223_2022-01-10.txt")
miRNA_223_mirtarbase_genes = [entry.upper() for entry in mirtarbase_entries]
len(miRNA_223_mirtarbase_genes)

311

In [76]:
# Convert the MiRNA 223 MirTarBase gene names to Ensembl gene IDs (ENSMUSG...).

In [77]:
# Load the gene -> Ensembl mapping table, upper-case the "genes" column and
# keep only the rows that actually have an Ensembl gene ID.
miRNA_223_mirtarbase_genes_mapping_table = (
    pd.read_csv("ENSMUS_converted.csv", index_col=0)
    .assign(genes=lambda table: table["genes"].str.upper())
    .loc[lambda table: table["ensembl_gene_id"].notna()]
)

miRNA_223_mirtarbase_genes_mapping_table

Unnamed: 0,genes,hgnc_symbol,ensembl_gene_id,entrezgene_id,gene_biotype
4,4932438A13RIK,,ENSMUSG00000037270,229227.0,protein_coding
6,ABCA1,,ENSMUSG00000015243,11303.0,protein_coding
7,ABHD13,,ENSMUSG00000040396,68904.0,protein_coding
8,ACAP2,,ENSMUSG00000049076,78618.0,protein_coding
9,ADCY1,,ENSMUSG00000020431,432530.0,protein_coding
...,...,...,...,...,...
307,ZFP704,,ENSMUSG00000040209,170753.0,protein_coding
308,ZFP91,,ENSMUSG00000024695,109910.0,protein_coding
309,ZHX3,,ENSMUSG00000035877,320799.0,protein_coding
310,ZMYND8,,ENSMUSG00000039671,228880.0,protein_coding


In [81]:
# Reduce the mapping table to its distinct Ensembl gene IDs and count them.
ensembl_id_column = miRNA_223_mirtarbase_genes_mapping_table["ensembl_gene_id"]
miRNA_223_mirtarbase_genes_ens = ensembl_id_column.unique().tolist()
len(miRNA_223_mirtarbase_genes_ens)

299

# Match

In [89]:
def match(L1, L2):
    """Print the number of distinct items shared by `L1` and `L2`, then the items.

    Both arguments are converted to sets first, so repeated entries are
    counted once. The report is written to standard output; nothing is
    returned.
    """
    shared = set(L1) & set(L2)
    n_shared = len(shared)
    print(f"Number of matched items {n_shared}")
    print(f"\t{shared}")

In [90]:
# Sanity check: the second list is empty, so the intersection is empty and zero matches are reported.
match(miRNA_223_mirtarbase_genes_ens, [])

Number of matched items 0
	set()


In [91]:
# For every gene-list file, print its path followed by the genes it shares
# with the MiRNA 223 MirTarBase Ensembl IDs.
for list_path in gene_list_files:
    genes_in_list = gene_list_files_to_genes[list_path]
    print(list_path)
    match(genes_in_list, miRNA_223_mirtarbase_genes_ens)
    print("\n")  # blank lines separate one file's report from the next

..\gen_lists\rna_m0_down.txt
Number of matched items 0
	set()


..\gen_lists\rna_m0_up.txt
Number of matched items 1
	{'ENSMUSG00000021643'}


..\gen_lists\rna_m1_down.txt
Number of matched items 2
	{'ENSMUSG00000021643', 'ENSMUSG00000019861'}


..\gen_lists\rna_m1_up.txt
Number of matched items 0
	set()


..\gen_lists\rpf_m0_down.txt
Number of matched items 24
	{'ENSMUSG00000001089', 'ENSMUSG00000006641', 'ENSMUSG00000040407', 'ENSMUSG00000037788', 'ENSMUSG00000043909', 'ENSMUSG00000063077', 'ENSMUSG00000034574', 'ENSMUSG00000000804', 'ENSMUSG00000030094', 'ENSMUSG00000040865', 'ENSMUSG00000029338', 'ENSMUSG00000020993', 'ENSMUSG00000026754', 'ENSMUSG00000040459', 'ENSMUSG00000036093', 'ENSMUSG00000028567', 'ENSMUSG00000035834', 'ENSMUSG00000036099', 'ENSMUSG00000024782', 'ENSMUSG00000051586', 'ENSMUSG00000032826', 'ENSMUSG00000029924', 'ENSMUSG00000042271', 'ENSMUSG00000037720'}


..\gen_lists\rpf_m0_up.txt
Number of matched items 31
	{'ENSMUSG00000037217', 'ENSMUSG00000030465', 'ENS