In [None]:
# Convert gene signatures from mouse to human gene names

In [None]:
import glob
import os

import numpy as np
import pandas as pd
import pybiomart

In [None]:
# Create pybiomart Dataset objects for the human and mouse Ensembl gene datasets
# (human first, then mouse), both pointing at the same host.
human, mouse = (
    pybiomart.Dataset(name=dataset_name, host='http://www.ensembl.org')
    for dataset_name in ('hsapiens_gene_ensembl', 'mmusculus_gene_ensembl')
)


In [None]:
# Build one dataframe per species holding, for every gene: its Ensembl gene ID,
# its gene name, and the Ensembl gene ID of its homologue in the other species.
# NOTE(review): later cells index these frames with the columns 'Gene stable ID',
# 'Gene name' and 'Human gene stable ID' -- presumably the display names pybiomart
# returns by default; confirm against the output of .head() below.
mouse_df = mouse.query(
    attributes=[
        'ensembl_gene_id',
        'external_gene_name',
        'hsapiens_homolog_ensembl_gene',
    ]
)
human_df = human.query(
    attributes=[
        'ensembl_gene_id',
        'external_gene_name',
        'mmusculus_homolog_ensembl_gene',
    ]
)

# Preview the human table
human_df.head()

In [None]:
# From the mouse table, remove genes that have no human homologue, i.e. rows whose
# 'Human gene stable ID' is NaN.
# Reassign the filtered frame instead of using inplace=True: the result bound to
# `mouse_df` is the same, but the frame returned by the query is not mutated.
mouse_df = mouse_df.dropna(subset=['Human gene stable ID'])

In [None]:
def mouse2human(mouse_input):
    """Return the human gene name(s) homologous to one mouse gene name.

    Reads the module-level ``mouse_df`` and ``human_df`` tables.

    The mapping is not always 1:1:
      * several human genes for one mouse gene -> all are returned
        (e.g. Klk1b26 has 3 human equivalents in human_df);
      * no human equivalent -> an empty array is returned
        (e.g. Ifi27l2b has no human equivalent in human_df);
      * a human gene with several mouse homologues appears on several rows of
        human_df -> those repeated rows are collapsed (e.g. human CES2).

    Parameters
    ----------
    mouse_input : str
        Mouse gene name, compared for equality with ``mouse_df['Gene name']``.

    Returns
    -------
    numpy.ndarray
        1-D object array of human gene names, ordered by human Ensembl gene ID
        and then by gene name. Always an ndarray, also when nothing matches.
        Human genes whose name is missing (NaN) are skipped.
    """
    # Ensembl IDs of the human homologue(s). dropna() keeps the function usable
    # even when mouse_df still contains genes without a human homologue.
    is_query_gene = mouse_df['Gene name'] == mouse_input
    human_ids = mouse_df.loc[is_query_gene, 'Human gene stable ID'].dropna().unique()

    # Look up all homologue IDs in a single pass over human_df.
    id_and_name = human_df.loc[
        human_df['Gene stable ID'].isin(human_ids),
        ['Gene stable ID', 'Gene name'],
    ]
    id_and_name = (
        id_and_name
        .dropna(subset=['Gene name'])   # an unnamed gene has no name to report
        .drop_duplicates()              # one row per (human ID, name) pair
        .sort_values(['Gene stable ID', 'Gene name'])
    )

    return id_and_name['Gene name'].to_numpy(dtype=object)
    

In [None]:
# Convert every mouse signature file (one mouse gene name per line) into a file of
# human gene names, written to the conversions folder as
# "<original name>_humanversions.txt".
output_dir = "/text_outputs/signatures_human_conversions/"

# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(output_dir, exist_ok=True)

# glob returns matches in arbitrary order; sort for a reproducible run.
file_list = sorted(glob.glob("/text_outputs/*.txt"))

for file in file_list:
    df = pd.read_csv(file, header=None, names=['mouse_gene'])

    # Gather the per-gene results and join them once; calling np.append inside
    # the loop would copy the whole accumulated array on every iteration.
    per_gene = [np.asarray(mouse2human(gene), dtype=object) for gene in df['mouse_gene']]
    human_genes = np.concatenate(per_gene) if per_gene else np.array([], dtype=object)

    # Derive the output name from the input file name without assuming a
    # particular path separator or extension length.
    stem = os.path.splitext(os.path.basename(file))[0]
    file_out = stem + "_humanversions.txt"
    pd.DataFrame(human_genes).to_csv(os.path.join(output_dir, file_out), index=False, header=False)
