In [1]:
import os
import re
import sys

import numpy as np
import pandas as pd
import scanpy as sc
import yaml
from IPython.display import display, HTML

In [2]:
# settings to display all columns + print result
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 1000)

# Generate the database

In [14]:
# Create place to store the database
database = []

# Specify walk_dir
walk_dir = 'auto-annotation-md'

# Counter check for .yaml file (should be 215)
n = 0

# Make walk_dir absolute
walk_dir = os.path.abspath(walk_dir)

print('walk_dir (absolute) = ' + walk_dir)

for root, subdirs, files in os.walk(walk_dir):
    list_file_path = os.path.join(root, 'my-directory-list.txt')
    with open(list_file_path, 'wb') as list_file:
        for filename in files:
            file_path = os.path.join(root, filename)
            
            with open(file_path, 'r') as stream:
                
                if file_path[-5:] == '.yaml':
                    content = stream.read()
                    database.append(list(yaml.load_all(content.split("---", 1)[0])))

walk_dir (absolute) = /Users/jrrrrr/Desktop/annotation/auto-annotation-md




In [15]:
database

[[{'name': 'Glutamatergic 2',
   'abbreviation': '@VGLUT2',
   'definition': '+Slc17a6',
   'go': None,
   'shogoin': None,
   'uberon': None,
   'categories': 'Ectodermal Neuronal Neuroectodermal',
   'version': 1,
   'synonyms': ['Glutamatergic neurotransmission']}],
 [{'name': 'Cholinergic',
   'abbreviation': '@CHOL',
   'definition': '+Chat +Slc5a7',
   'categories': 'Ectodermal Neuronal Neuroectodermal',
   'version': 1,
   'synonyms': ['Cholinergic neurotransmission']}],
 [{'name': 'Glutamatergic 3',
   'abbreviation': '@VGLUT3',
   'definition': '+Slc17a8',
   'go': None,
   'shogoin': None,
   'uberon': None,
   'synonyms': ['Glutamatergic neurotransmission'],
   'categories': 'Ectodermal Neuronal Neuroectodermal'}],
 [{'name': 'Serotonergic',
   'abbreviation': '@SER',
   'definition': '+Tph2 +Fev',
   'categories': 'Ectodermal Neuronal Neuroectodermal',
   'version': 1,
   'synonyms': None}],
 [{'name': 'Adrenergic',
   'abbreviation': 'ADR',
   'definition': '+Dbh +Ddc +Th 

In [16]:
len(database)

215

# Load the table of the most highly expressed genes per cluster

In [5]:
# Read the gene table from CSV into `table`; annotation_helper() below treats
# each column as one cluster and each cell in that column as a gene name.
table=pd.read_csv('~/Desktop/test.csv')

In [6]:
table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Gkn3,Tcf7l2,Otx2,Tfap2b,Gm17750,Igfbpl1,Sox14,Tmsb10,Six3os1,Vim,Sparc,Alas2,C1qc,Kcnj8
1,Snhg11,Lhx9,Lmo1,Malat1,Ebf1,Lhx2,Nfib,Mllt11,Pnoc,Fabp7,Vim,Hbb-bs,C1qa,Rgs5
2,Rtn1,Igfbpl1,H3f3b,Nxph1,Lef1,Rps11,Lhx5,Tuba1a,Six3,Phgdh,Itm2a,Hbb-bt,Trem2,Mmp9
3,Ndn,Tubb5,Adarb2,Gad2,Lmo3,Rpl13a,Tfap2b,Gap43,Npy,Ckb,Serpinh1,Hba-a2,Cx3cr1,Ndufa4l2
4,Ly6h,Sox11,Malat1,Nrxn3,Islr2,Rps6,Gad2,Nnat,Uncx,Plpp3,Rps27,Hba-a1,Pld4,Cald1


# Function for annotation

In [8]:
def annotation_helper(table, database):
    
    # Number of clusters
    number_of_clusters = len(table.columns)

    # Create a table to display results
    result_all = pd.DataFrame()

    for cluster_id in range(number_of_clusters):

        # Collecting list of genes for 'cluster_id' as array
        genes_list = table.iloc[:,cluster_id].values
        genes_number = len(genes_list)

        # Creating a list for search results
        result_cluster = [''] * genes_number
        print('')
        print('Working on cluster: ', cluster_id)
        print('List of genes: '     , genes_list)
        print('Number of genes: '   , genes_number)

        # Search each gene from 'genes_list' in database
        for x in range(len(database)):
            for i in range(genes_number):  

                if '+{}'.format(genes_list[i]) in database[x][0]['definition']:
                    result_cluster[i] += '+{}\n'.format(database[x][0]['abbreviation'])
                    print('Found {} as a + marker for {}'.format(genes_list[i],database[x][0]['name']))

                elif '-{}'.format(genes_list[i]) in database[x][0]['definition']:
                    result_cluster[i] += '-{}\n'.format(database[x][0]['abbreviation'])
                    print('Found {} as a - marker for {}'.format(genes_list[i],database[x][0]['name']))
        
        #print(result_cluster)

        # Write result
        result_all = pd.concat([result_all, pd.DataFrame({'cluster_{}'.format(cluster_id):genes_list, 'result_{}'.format(cluster_id):result_cluster})], axis=1)
    
    return result_all

In [9]:
# Annotate every column of `table` against `database`; the call prints its
# progress per cluster and returns the combined genes/results DataFrame.
result_all = annotation_helper(table, database)


Working on cluster:  0
List of genes:  ['Gkn3' 'Snhg11' 'Rtn1' 'Ndn' 'Ly6h']
Number of genes:  5
Found Gkn3 as a + marker for Angiogenesis activation

Working on cluster:  1
List of genes:  ['Tcf7l2' 'Lhx9' 'Igfbpl1' 'Tubb5' 'Sox11']
Number of genes:  5
Found Tcf7l2 as a + marker for Roof plate of Thalamus
Found Tcf7l2 as a + marker for Epithalamus
Found Igfbpl1 as a + marker for Neural Progenitors
Found Tcf7l2 as a - marker for Neuron hindbrain V2b interneurons
Found Tcf7l2 as a + marker for Dorsal Thalamic tier Excitatory type 1
Found Lhx9 as a + marker for Hindbrain Excitatory type 4
Found Lhx9 as a + marker for Dentate gyrus cells
Found Lhx9 as a + marker for Hindbrain C1 Glutamatergic Neuroblast
Found Tcf7l2 as a - marker for Gabaergic Neuroblast of the colliculus
Found Tcf7l2 as a - marker for Neuroblast hindbrain V2b interneurons
Found Tcf7l2 as a + marker for Neuroblast Midbrain type 8
Found Igfbpl1 as a + marker for Neuroblast Medial

Working on cluster:  2
List of genes:  ['

In [10]:
def pretty_print(df):
    """Render ``df`` as an HTML table in the notebook output.

    Every backslash-followed-by-``n`` sequence in the generated HTML is
    replaced with a ``<br>`` tag before display, so multi-line cell text is
    shown on separate lines. Returns whatever ``display`` returns.
    """
    table_markup = df.to_html()
    table_markup = table_markup.replace("\\n", "<br>")
    rendered = HTML(table_markup)
    return display(rendered)

In [11]:
pretty_print(result_all)

Unnamed: 0,cluster_0,result_0,cluster_1,result_1,cluster_2,result_2,cluster_3,result_3,cluster_4,result_4,cluster_5,result_5,cluster_6,result_6,cluster_7,result_7,cluster_8,result_8,cluster_9,result_9,cluster_10,result_10,cluster_11,result_11,cluster_12,result_12,cluster_13,result_13
0,Gkn3,+@Angio,Tcf7l2,+@p2RP +epiTh -IntV2b +ThaEx1 -NbIntCL1 -NbIntV2b +NbMid8,Otx2,-@PNT +@MHBr -@MHBc +NbTG1,Tfap2b,+NbTG3a +NbIntTG2 +NbTG1 +NbTG2 +NbIntTG1 +NbHin5 +NbTG3b,Gm17750,,Igfbpl1,+NProg +NblastM,Sox14,+@DMid +IntV2b +Hypo-VM +NbIntV2b,Tmsb10,,Six3os1,,Vim,+RglMHb +EPN-T +Rgl +RglFb +EPN-Ep,Sparc,,Alas2,+ERY,C1qc,,Kcnj8,
1,Snhg11,,Lhx9,+HinEx4 +HC-DGC +NbGlutHc1,Lmo1,,Malat1,,Ebf1,+NbMSN,Lhx2,+@OXT +@Eye +HinEx3 +NbHin1 +NbGlutHc1,Nfib,+IntMid2,Mllt11,,Pnoc,,Fabp7,,Vim,+RglMHb +EPN-T +Rgl +RglFb +EPN-Ep,Hbb-bs,,C1qa,,Rgs5,+PERI -RglMHb -Rgl -RglFb
2,Rtn1,,Igfbpl1,+NProg +NblastM,H3f3b,,Nxph1,+IntHc1 +IntMid1 +NbIntTG2 +NbIntTG1,Lef1,,Rps11,,Lhx5,+@Hem +CRC,Tuba1a,,Six3,+@Eye -MGabaProg +ANE +IntMid1 +MSN1 +NbRtC,Phgdh,,Itm2a,,Hbb-bt,,Trem2,,Mmp9,
3,Ndn,,Tubb5,,Adarb2,+CGEInt +Glyc-Cblc2 +IntFV1,Gad2,+@GABAGLUT1 +@GABA +Glyc-Cblc2,Lmo3,+NbFor4 +NbMid3,Rpl13a,,Tfap2b,+NbTG3a +NbIntTG2 +NbTG1 +NbTG2 +NbIntTG1 +NbHin5 +NbTG3b,Gap43,,Npy,+@AGRP -IntMid2 +IntMid1 +OB-OEC,Ckb,,Serpinh1,,Hba-a2,+ERY,Cx3cr1,,Ndufa4l2,
4,Ly6h,,Sox11,,Malat1,,Nrxn3,+Glyc-Skor1 +IntHc1,Islr2,+IntHin1 +NbMid3,Rps6,,Gad2,+@GABAGLUT1 +@GABA +Glyc-Cblc2,Nnat,,Uncx,+MB +NbFor1,Plpp3,,Rps27,,Hba-a1,,Pld4,,Cald1,
