In [1]:
# Generate 1:1 tables of homologous genes between species pairs (zebrafish, mouse, human) using Ensembl BioMart.
# Gene conversion strategy adapted from: https://github.com/Papatheodorou-Group/BENGAL/blob/main/bin/concat_by_homology_multiple_species_by_gene_id.R

In [2]:
import pybiomart
import pandas as pd
import numpy as np

# Record the library versions this notebook was run with (one per line).
print(*(lib.__version__ for lib in (pybiomart, pd, np)), sep='\n')

0.2.0
1.3.5
1.21.6




In [3]:
# BioMart gene datasets for the two reference species, in order: mouse, human.
mouse, human = (
    pybiomart.Dataset(name=dataset_name, host='http://www.ensembl.org')
    for dataset_name in ('mmusculus_gene_ensembl', 'hsapiens_gene_ensembl')
)


In [4]:
# Names of the main chromosomes per species, as strings
# (numbered chromosomes first, then 'X' and 'Y' where listed).
genes_main_chr = {
    'hsapiens': [str(number) for number in range(1, 23)] + ['X', 'Y'],
    'mmusculus': [str(number) for number in range(1, 20)] + ['X', 'Y'],
    'drerio': [str(number) for number in range(1, 26)],
}

In [5]:
def create_table(species1, species0, pybio, fname):
    """Build a 1:1 homology table between two species and save it as CSV.

    Parameters
    ----------
    species1 : str
        Species whose gene names will be converted (e.g. ``'drerio'``). Every
        BioMart attribute whose name contains this string is queried.
    species0 : str
        Reference species: species1 names are converted to species0 names.
        Must be a key of the module-level ``genes_main_chr`` dict.
    pybio : object
        species0's pybiomart dataset; only ``list_attributes()`` and
        ``query(attributes=...)`` are used.
    fname : str
        Filename to save the homology table (CSV) to.

    Returns
    -------
    None
        The table is written to ``fname``.

    Raises
    ------
    KeyError
        If ``species0`` has no entry in ``genes_main_chr``, or if the query
        result lacks the expected homolog columns.
    """
    # Look the chromosome list up first so an unknown species fails before any query.
    main_chromosomes = genes_main_chr[species0]

    # Fetch the attribute table once; it is needed for both the column
    # selection and the display-name -> attribute-name mapping below.
    attribute_df = pybio.list_attributes()
    is_species1_attr = attribute_df['name'].str.contains(species1, regex=False)
    species1_colnames = attribute_df['name'][is_species1_attr]

    df = pybio.query(attributes=["ensembl_gene_id", "external_gene_name",
                                 "chromosome_name", "start_position", "end_position"] +
                     list(species1_colnames))

    # Keep the attribute 'name' instead of the 'display_name' as column labels.
    attribute_dict = dict(zip(attribute_df.display_name, attribute_df.name))
    df = df.rename(columns=attribute_dict)

    df = _select_one_to_one_homologs(df, species1, species0, main_chromosomes)
    df.to_csv(fname)


def _select_one_to_one_homologs(df, species1, species0, main_chromosomes):
    """Reduce a homolog query result to one row per species1 and per species0 gene name."""
    species1_name = species1 + '_homolog_associated_gene_name'
    species0_name = species0 + '_homolog_associated_gene_name'

    # Keep only genes on species0's main chromosomes. Compare as strings
    # because the chromosome column can hold a mix of ints and strings.
    on_main_chr = df['chromosome_name'].astype(str).isin(main_chromosomes)
    df = df[on_main_chr].copy()

    # Duplicate the reference species' columns under species-prefixed names
    # so both species follow the same '<species>_homolog_*' naming.
    df[species0_name] = df['external_gene_name']
    df[species0 + "_homolog_ensembl_gene"] = df['ensembl_gene_id']
    df[species0 + "_homolog_chromosome"] = df['chromosome_name']
    df[species0 + "_homolog_chrom_start"] = df['start_position']
    df[species0 + "_homolog_chrom_end"] = df['end_position']

    # A usable pair needs a gene name in both species.
    df = df.dropna(subset=[species1_name, species0_name])

    # Best homologs first: orthology confidence is the primary key and GOC
    # score breaks ties. A single multi-key sort guarantees this ordering;
    # two consecutive single-key sorts do not, as the default sort is unstable.
    df = df.sort_values(by=[species1 + "_homolog_orthology_confidence",
                            species1 + "_homolog_goc_score"],
                        ascending=False)

    # Keep only the best-ranked row for each species1 name, then for each
    # species0 name. This also removes repeated (species1, species0) pairs.
    df = df.drop_duplicates(subset=[species1_name], keep='first')
    df = df.drop_duplicates(subset=[species0_name], keep='first')
    return df

In [6]:
# Zebrafish -> mouse
species1, species0 = 'drerio', 'mmusculus'  # (species whose names we convert, reference species)
pybio = mouse  # pybiomart dataset of the reference species (species0)
fname = 'homology_tbl_' + species0 + '_' + species1 + '-20231117.csv'

create_table(species1=species1, species0=species0, pybio=pybio, fname=fname)

In [7]:
# Zebrafish -> human
species1, species0 = 'drerio', 'hsapiens'  # (species whose names we convert, reference species)
pybio = human  # pybiomart dataset of the reference species (species0)
fname = 'homology_tbl_' + species0 + '_' + species1 + '-20231117.csv'

create_table(species1=species1, species0=species0, pybio=pybio, fname=fname)

  app.launch_new_instance()


In [8]:
# Mouse -> human
species1, species0 = 'mmusculus', 'hsapiens'  # (species whose names we convert, reference species)
pybio = human  # pybiomart dataset of the reference species (species0)
fname = 'homology_tbl_' + species0 + '_' + species1 + '-20231117.csv'

create_table(species1=species1, species0=species0, pybio=pybio, fname=fname)

  app.launch_new_instance()
