# Gene matrix recipe

### 0. Preparation
This step includes import statements, path settings, and function definitions.

In [None]:
# Import statements
import os
import yaml
import numpy as np
import pandas as pd

from collections import defaultdict

In [None]:
# Path settings
# The project directory is the parent of the current working directory.
project_dir = os.path.dirname(os.path.abspath('.'))
conf_dir = os.path.join(project_dir, 'conf')

# Configuration files stored under the 'conf' directory
path_conf_path, gene_list_conf_path, gene_type_conf_path = (
    os.path.join(conf_dir, conf_name)
    for conf_name in ('filepaths.yaml', 'gene_list_names.yaml', 'gene_biotype.yaml')
)

# File paths (relative to the project directory) keyed by dataset name
with open(path_conf_path) as path_conf_file:
    path_dict = yaml.safe_load(path_conf_file)

# Names of the gene list columns keyed by dataset/column name
with open(gene_list_conf_path) as gene_list_conf_file:
    gene_list_name_dict = yaml.safe_load(gene_list_conf_file)

# Make sure the log directory exists before any log is written
log_dir = os.path.join(project_dir, path_dict['LOG_DIR'])
os.makedirs(log_dir, exist_ok=True)

# Input/output files used in the steps below
gencode_path, prev_gene_mat_path, hgnc_path, alt_gene_list_path = (
    os.path.join(project_dir, path_dict[path_key])
    for path_key in ('GENCODE', 'An2018', 'HGNC', 'An2018_ALT_GENE')
)

In [None]:
# Load the data
# Names assigned to the nine columns of the GENCODE annotation file
gencode_columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
gencode_df = pd.read_table(gencode_path, compression='gzip', comment='#', names=gencode_columns)

# Split the annotation by its feature type
gencode_feature = gencode_df['feature']
gencode_gene_df = gencode_df[gencode_feature == 'gene']  # Genes only
gencode_tx_df = gencode_df[gencode_feature == 'transcript']  # Transcripts only

# HGNC table, restricted to the columns with symbols and external IDs
hgnc_columns = ['hgnc_id', 'symbol', 'alias_symbol', 'prev_symbol', 'ensembl_gene_id', 'ucsc_id', 'refseq_accession']
hgnc_df = pd.read_table(hgnc_path, usecols=hgnc_columns)

# The previous gene matrix (An et al., 2018)
prev_gene_mat_df = pd.read_excel(prev_gene_mat_path, sheet_name='8-1 Genesets')

In [None]:
# Function to parse strings of the 'attribute' field in the GENCODE.
def parse_attr_str(attr_str):
    """Parse a ';'-separated string of 'key=value' pairs into a dictionary.

    Args:
        attr_str: The attribute string, e.g. 'ID=ENSG1.1;gene_name=A'.

    Returns:
        A dictionary mapping each key to its value (both strings). If a key
        occurs more than once, the last value wins.

    Raises:
        ValueError: If a non-empty field does not contain '='.
    """
    attr_dict = {}

    for attr in attr_str.split(';'):
        # Skip empty fields, e.g. the one produced by a trailing ';' or an empty string
        if not attr.strip():
            continue

        # Split on the first '=' only so that values containing '=' stay intact
        key, value = attr.split('=', 1)
        attr_dict[key] = value

    return attr_dict

In [None]:
# Parse values of the 'attribute' field in the GENCODE
gene_to_attr_dict = {}  # Key: Gene ID (version removed), Value: The parsed 'attribute' dictionary
gene_name_to_ids = defaultdict(list)  # Key: Gene name, Value: The list of gene IDs
hgnc_to_ids = defaultdict(list)  # Key: HGNC ID, Value: The list of gene IDs

for gene_attr_str in gencode_gene_df['attribute'].values:
    gene_attrs = parse_attr_str(gene_attr_str)
    full_gene_id = gene_attrs['ID']

    if full_gene_id.endswith('Y'):  # Ignore pseudoautosome region (PAR_Y)
        continue

    gene_id = full_gene_id.split('.')[0]  # Drop the version suffix
    gene_to_attr_dict[gene_id] = gene_attrs
    gene_name_to_ids[gene_attrs['gene_name']].append(gene_id)

    # Not every gene has an HGNC ID
    hgnc_id = gene_attrs.get('hgnc_id')

    if hgnc_id is not None:
        hgnc_to_ids[hgnc_id].append(gene_id)

tx_to_attr_dict = {}  # Key: Transcript ID (version removed), Value: The parsed 'attribute' dictionary

for tx_attr_str in gencode_tx_df['attribute'].values:
    tx_attrs = parse_attr_str(tx_attr_str)
    full_tx_id = tx_attrs['ID']

    if full_tx_id.endswith('Y'):  # Ignore pseudoautosome region (PAR_Y)
        continue

    tx_to_attr_dict[full_tx_id.split('.')[0]] = tx_attrs

In [None]:
# Parse the HGNC file and make dictionaries whose keys are gene symbols (current, alias, or previous)
# and whose values are the lists of GENCODE gene IDs for those symbols.
hgnc_df_col_idx = {column: i for i, column in enumerate(hgnc_df.columns.values)}
hgnc_df_val = hgnc_df.values

symbol_to_gene_ids = {}  # Key: HGNC symbol, Value: The list of gene IDs
alias_to_gene_ids = {}  # Key: HGNC alias symbol, Value: The list of gene IDs
prev_symbol_to_gene_ids = {}  # Key: HGNC previous symbol, Value: The list of gene IDs

for hgnc_entry in hgnc_df_val:
    gene_symbol = hgnc_entry[hgnc_df_col_idx['symbol']]
    alias_symbol_str = hgnc_entry[hgnc_df_col_idx['alias_symbol']]
    prev_symbol_str = hgnc_entry[hgnc_df_col_idx['prev_symbol']]
    ensembl_gene_id = hgnc_entry[hgnc_df_col_idx['ensembl_gene_id']]

    # Missing values are tested with pd.isna() instead of `is np.nan`, because the identity test
    # only works when the missing value happens to be the np.nan singleton itself.
    if pd.isna(ensembl_gene_id):
        continue

    gencode_attr_dict = gene_to_attr_dict.get(ensembl_gene_id)

    if gencode_attr_dict is None:  # This Ensembl gene ID is not in the GENCODE
        continue

    # Use the gene name of the GENCODE to collect every gene ID sharing the name
    gene_ids = gene_name_to_ids.get(gencode_attr_dict['gene_name'])  # From GENCODE v33

    if gene_ids is None:
        continue

    symbol_to_gene_ids[gene_symbol] = gene_ids

    # Alias and previous symbols are stored as '|'-separated strings
    if not pd.isna(alias_symbol_str):
        for alias_symbol in alias_symbol_str.split('|'):
            alias_to_gene_ids[alias_symbol] = gene_ids

    if not pd.isna(prev_symbol_str):
        for prev_symbol in prev_symbol_str.split('|'):
            prev_symbol_to_gene_ids[prev_symbol] = gene_ids

In [None]:
# Function to find a gene ID from a gene symbol of the HGNC and the GENCODE v33
def find_gene_ids(gene_symbol):
    """Return the list of gene IDs for a gene symbol, or None if it cannot be found.

    The lookup tables are tried in the following priority order and the first hit wins:
    1. GENCODE v33 gene names
    2. HGNC symbols
    3. HGNC alias symbols
    4. HGNC previous symbols
    """
    lookup_tables = (
        gene_name_to_ids,  # GENCODE v33
        symbol_to_gene_ids,  # HGNC: symbol
        alias_to_gene_ids,  # HGNC: alias
        prev_symbol_to_gene_ids,  # HGNC: previous symbol
    )

    for lookup_table in lookup_tables:
        found_gene_ids = lookup_table.get(gene_symbol)

        if found_gene_ids is not None:
            return found_gene_ids

    return None

### 1. Make a list of alternative gene names and IDs for the deprecated genes of An et al., Science, 2018
The purpose of this step is to find genes **deprecated in the GENCODE v33** from the *8-1 Genesets* sheet of the *Supplementary Table S8* in An *et al.*, *Science*, 2018 (This is a **previous gene matrix**) and to find their alternatives.

*Note: If you already have the file at `alt_gene_list_path`, skip this step.*

In [None]:
# Count the genes of the previous gene matrix that are deprecated in the GENCODE v33
gene_id_set = set(gene_to_attr_dict)  # From the GENCODE v33

# Gene IDs of the previous gene matrix without their version suffixes
strip_version = np.vectorize(lambda versioned_id: versioned_id.split('.')[0])
prev_gene_ids = strip_version(prev_gene_mat_df['EnsemblGeneId'].values)

# A gene is deprecated if its ID does not exist in the GENCODE v33
is_depr_gene = np.vectorize(lambda prev_id: prev_id not in gene_id_set)(prev_gene_ids)
print(f'No. deprecated genes: {sum(is_depr_gene)}')

# Column-wise sums over the deprecated genes
depr_gene_mat_df = prev_gene_mat_df[is_depr_gene]
depr_gene_mat_df.sum(0)

In [None]:
# The list of deprecated genes with their alternative names and IDs
# These genes were in the gene lists of An et al., 2018 (the previous GENCODE biotype columns are not considered).
# Columns: gene, gene_id, alt_gene, alt_gene_id
# A deprecated gene may have more than one entry if it has multiple alternative gene IDs (e.g. TBCE).
# Ref: https://www.genecards.org, http://asia.ensembl.org
depr_gene_entries = [
    ['AKAP2', 'ENSG00000241978.9', 'PALM2AKAP2', 'ENSG00000157654.19'],
    ['BTBD8', 'ENSG00000284413.2', 'BTBD8', 'ENSG00000189195.13'],
    ['C2orf48', 'ENSG00000163009.8', 'RRM2', 'ENSG00000171848.15'],
    ['C3orf36', 'ENSG00000221972.3', 'SLCO2A1', 'ENSG00000174640.15'],
    ['C8orf44', 'ENSG00000213865.7', 'SGK3', 'ENSG00000104205.15'],
    ['C9orf47', 'ENSG00000186354.10', 'S1PR3', 'ENSG00000213694.5'],
    ['HIST1H3B', 'ENSG00000274267.1', 'H3C2', 'ENSG00000286522.1'],
    ['HIST1H3C', 'ENSG00000278272.1', 'H3C3', 'ENSG00000287080.1'],
    ['SCO2', 'ENSG00000130489.14', 'SCO2', 'ENSG00000284194.2'],
    ['TBCE', 'ENSG00000116957.12', 'TBCE', 'ENSG00000284770.2'],
    ['TBCE', 'ENSG00000116957.12', 'TBCE', 'ENSG00000285053.1'],
    ['TMEM133', 'ENSG00000170647.3', 'ARHGAP42', 'ENSG00000165895.19'],
    ['PALM2', 'ENSG00000243444.7', 'PALM2AKAP2', 'ENSG00000157654.19'],  # Symbol fixed: was misspelled 'PAML2'
]

# Write the entries as a tab-separated file with a header line
with open(alt_gene_list_path, 'w') as alt_gene_list_file:
    print('gene', 'gene_id', 'alt_gene', 'alt_gene_id', sep='\t', file=alt_gene_list_file)

    for gene_entry in depr_gene_entries:
        print(*gene_entry, sep='\t', file=alt_gene_list_file)

### 2. Update the previous gene matrix to be compatible with GENCODE v33
This step replaces the deprecated genes of the previous gene matrix with new ones.

In [None]:
# Load the list of deprecated genes from the previous gene matrix
# Key: deprecated gene ID (version removed), Value: The list of alternative gene IDs (version removed)
alt_gene_dict = defaultdict(list)

with open(alt_gene_list_path) as alt_gene_list_file:
    alt_gene_list_file.readline()  # Skip the header

    for line in alt_gene_list_file:
        fields = line.strip().split('\t')

        if len(fields) < 4:  # Skip blank or incomplete lines instead of failing on them
            continue

        prev_gene_id = fields[1].split('.')[0]
        alt_gene_id = fields[3].split('.')[0]
        alt_gene_dict[prev_gene_id].append(alt_gene_id)

# Remove genes in pseudoautosome regions (PAR_Y)
prev_gene_mat_df = prev_gene_mat_df[np.vectorize(lambda x: not x.endswith('_PAR_Y'))(prev_gene_mat_df['EnsemblGeneId'].values)]

# Values from the previous gene matrix
prev_gene_mat_values = prev_gene_mat_df.values
prev_gene_mat_cols = prev_gene_mat_df.columns.values
col_idx_dict = {colname: i for i, colname in enumerate(prev_gene_mat_cols)}

# Get values from the previous gene matrix and store as dictionary
prev_gene_mat_dict = {}  # Key: gene ID, Value: a dictionary (gene list name -> its value (0 or 1))

for prev_gene_vals in prev_gene_mat_values:
    # Get the values for each gene list column.
    # Columns without an entry in 'gene_list_name_dict' are not for gene lists and are skipped.
    gene_val_dict = {}

    for colname in prev_gene_mat_cols:
        gene_list_name = gene_list_name_dict.get(colname)

        if gene_list_name is not None:
            gene_val_dict[gene_list_name] = prev_gene_vals[col_idx_dict[colname]]

    prev_gene_id = prev_gene_vals[col_idx_dict['EnsemblGeneId']].split('.')[0]
    alt_gene_ids = alt_gene_dict.get(prev_gene_id, [prev_gene_id])  # Change the gene ID if it was deprecated

    # Make up the dictionary for the gene matrix
    for gene_id in alt_gene_ids:
        # Work on a fresh copy for every target gene ID. Merging into 'gene_val_dict' itself would
        # carry the values merged for one alternative gene ID over to the next alternative gene ID.
        merged_val_dict = dict(gene_val_dict)
        same_gene_dict = prev_gene_mat_dict.get(gene_id)

        # Merge values of duplicated genes using logical OR operation
        if same_gene_dict is not None:
            for gene_list_name in merged_val_dict:
                merged_val_dict[gene_list_name] |= same_gene_dict[gene_list_name]

        prev_gene_mat_dict[gene_id] = merged_val_dict

# Make a dictionary for the new gene matrix
# Genes that are not in the previous gene matrix only get the identifier columns here;
# their gene list values are filled with 0 below.
new_gene_mat_dict = {}

for gene_id, gene_attr_dict in gene_to_attr_dict.items():
    gene_val_dict = dict(prev_gene_mat_dict.get(gene_id, {}))  # Copy so 'prev_gene_mat_dict' is left untouched
    gene_val_dict['gene_id'] = gene_attr_dict['ID']
    gene_val_dict['gene_name'] = gene_attr_dict['gene_name']
    new_gene_mat_dict[gene_id] = gene_val_dict

# Make a DataFrame for the new gene matrix
gene_mat_df = pd.DataFrame.from_dict(new_gene_mat_dict, orient='index')

# Put the identifier columns first by name, so the order does not depend on the position
# at which these columns happen to appear in the DataFrame.
id_cols = ['gene_id', 'gene_name']
gene_mat_cols = id_cols + [colname for colname in gene_mat_df.columns.values if colname not in id_cols]
gene_mat_df = gene_mat_df[gene_mat_cols]
gene_mat_df.fillna(0, inplace=True)
gene_mat_df = gene_mat_df.astype({gene_list_col: 'int64' for gene_list_col in gene_mat_cols[2:]})
gene_mat_df.head()

### 3. Add GENCODE biotypes as gene list columns into the gene matrix
The following columns will be added in this step.
1. Protein_coding
2. Long_ncRNA
3. Small_ncRNA
4. Pseudogene
5. IG_TR_Gene

In [None]:
# Add GENCODE biotypes
# The configuration maps each biotype category (a new column) to the GENCODE gene types it covers.
with open(gene_type_conf_path) as gene_type_conf_file:
    biotype_dict = yaml.safe_load(gene_type_conf_file)

for biotype_category, category_biotypes in biotype_dict.items():
    biotype_set = set(category_biotypes)

    # 1 if the gene type of the gene belongs to this category, otherwise 0
    gene_mat_val_dict = {
        gene_id: int(gene_attr_dict['gene_type'] in biotype_set)
        for gene_id, gene_attr_dict in gene_to_attr_dict.items()
    }
    gene_mat_df[biotype_category] = pd.Series(gene_mat_val_dict)

gene_mat_df.head()

### 4. Update new gene lists
The purpose of the following steps is to update new datasets to our gene matrix.

##### 01. 102 ASD genes (Satterstrom et al., Cell, 2020)

In [None]:
# Load the dataset and drop the rows with missing values
asd_gene_list_path = os.path.join(project_dir, path_dict['ASD'])
asd_df = pd.read_excel(asd_gene_list_path, sheet_name='102_ASD')
asd_df.dropna(inplace=True)

# Find the 102 ASD genes from the genes of the gene matrix
asd_gene_set = set(asd_df['ensembl_gene_id'].values)
mark_asd_gene = np.vectorize(lambda gene_id: int(gene_id in asd_gene_set))
is_asd_gene = mark_asd_gene(gene_mat_df.index.values)

# Update the gene list to the gene matrix
asd_colname = gene_list_name_dict['ASD']
gene_mat_df[asd_colname] = is_asd_gene
gene_mat_df.head()

##### 02. 299 DDD genes

In [None]:
# Load the dataset
ddd_gene_list_path = os.path.join(project_dir, path_dict['DDD'])
ddd_df = pd.read_excel(ddd_gene_list_path, sheet_name='kaplanis_samocha_denovoWEST_res')
ddd_df = ddd_df[ddd_df['significant'] == True]  # Leave only significant genes
ddd_df = ddd_df.astype({'hgnc_id': 'int64'})  # Integer IDs so that they are formatted without a decimal point

# Find the 299 DDD genes from the genes of the gene matrix
# The HGNC IDs of the GENCODE have the form 'HGNC:<number>', so the same form is built here.
to_hgnc_id = np.vectorize(lambda hgnc_num: f'HGNC:{hgnc_num}')
ddd_hgnc_ids = to_hgnc_id(ddd_df['hgnc_id'].values)
ddd_hgnc_id_set = set(ddd_hgnc_ids)

# Genes without an HGNC ID are mapped to None, which never matches the set
gene_id_to_hgnc_id = {
    gene_id: gene_attr_dict.get('hgnc_id')
    for gene_id, gene_attr_dict in gene_to_attr_dict.items()
}
mark_ddd_gene = np.vectorize(lambda gene_id: int(gene_id_to_hgnc_id[gene_id] in ddd_hgnc_id_set))
is_ddd_gene = mark_ddd_gene(gene_mat_df.index.values)

# Update the gene list to the gene matrix
ddd_colname = gene_list_name_dict['DDD']
gene_mat_df[ddd_colname] = is_ddd_gene
gene_mat_df.head()

##### 03. Haploinsufficient genes

In [None]:
# Load the dataset
gnomad_gene_list_path = os.path.join(project_dir, path_dict['GNOMAD_GENE'])
gnomad_gene_df = pd.read_table(gnomad_gene_list_path)

# Extract HI genes (HI: Haploinsufficient), i.e. genes with a pLI score >= 0.9
# A missing pLI score (NaN) compares as False, so such genes are excluded by the comparison itself.
is_hi_gene = (gnomad_gene_df['pLI'] >= 0.9).values
hi_gene_df = gnomad_gene_df[is_hi_gene]

# Dictionaries for pLI scores
tx_to_pli = {}  # Key: Transcript ID, Value: pLI score
symbol_to_pli = {}  # Key: Gene symbol, Value: pLI score
gene_to_pli = {}  # Key: Gene ID, Value: pLI score

# Update the 'tx_to_pli' and the 'symbol_to_pli'
hi_txs = hi_gene_df['transcript'].values
hi_symbols = hi_gene_df['gene'].values
hi_plis = hi_gene_df['pLI'].values

for tx_id, symbol, pli_score in zip(hi_txs, hi_symbols, hi_plis):
    tx_to_pli[tx_id] = pli_score
    prev_symbol_pli = symbol_to_pli.get(symbol)

    # Choose the maximum pLI score for duplicated symbols.
    if prev_symbol_pli is None or prev_symbol_pli < pli_score:
        symbol_to_pli[symbol] = pli_score

# The 'gene_to_pli' dictionary is updated in two steps.
# Step 1: Update the 'gene_to_pli' dictionary by getting gene IDs of the transcript IDs via the GENCODE
depr_tx_set = set()  # Transcript IDs that cannot be found in the GENCODE

for tx_id in hi_txs:
    attr_dict = tx_to_attr_dict.get(tx_id)

    if attr_dict is None:
        depr_tx_set.add(tx_id)
        continue

    # The 'Parent' attribute of a transcript is used as its gene ID (version removed)
    gene_id = attr_dict['Parent'].split('.')[0]
    prev_gene_pli = gene_to_pli.get(gene_id)
    pli_score = tx_to_pli[tx_id]

    # Choose the maximum pLI score for duplicated genes.
    if prev_gene_pli is None or prev_gene_pli < pli_score:
        gene_to_pli[gene_id] = pli_score

# Step 2: Update the 'gene_to_pli' dictionary by getting gene IDs of the deprecated transcript IDs
# using their gene symbols (looked up in the GENCODE first and then in the HGNC)
# Series.isin() is used because it also works when there is no deprecated transcript at all.
depr_hi_gene_df = hi_gene_df[hi_gene_df['transcript'].isin(depr_tx_set).values]
depr_gene_symbols = depr_hi_gene_df['gene'].values

for depr_gene_symbol in depr_gene_symbols:
    gene_ids = find_gene_ids(depr_gene_symbol)

    if gene_ids is None:
        print(f'{depr_gene_symbol} cannot be found in both the GENCODE and the HGNC.')
        continue

    # Update only if the same gene ID does not exist
    for gene_id in gene_ids:
        if gene_to_pli.get(gene_id) is None:
            gene_to_pli[gene_id] = symbol_to_pli[depr_gene_symbol]

# Update the gene matrix
# HC: High-confident
# 'gene_to_pli' only contains genes with a pLI score >= 0.9, so a gene without an entry gets 0 in both columns.
mat_gene_plis = [gene_to_pli.get(gene_id) for gene_id in gene_mat_df.index.values]
is_hi_gene = np.array([int(pli is not None) for pli in mat_gene_plis])  # pLI score >= 0.9
is_hc_hi_gene = np.array([int(pli is not None and pli >= 0.995) for pli in mat_gene_plis])  # pLI score >= 0.995
gene_mat_df[gene_list_name_dict['GNOMAD_PLI90']] = is_hi_gene
gene_mat_df[gene_list_name_dict['GNOMAD_PLI995']] = is_hc_hi_gene
gene_mat_df.head()

##### 04. Co-expression network analysis modules associated with ASD (Parikshak et al., Nature, 2016) 
- Genome wide transcriptome analysis of 251 post-mortem samples of frontal and temporal cortex and cerebellum from 48 individuals with ASD and 49 control subjects
- Identified 6 modules significantly associated with ASD of 24 modules from WGCNA in the cortex analysis
- Upregulated: CTX.M9, CTX.M19, CTX.M20
- Downregulated: CTX.M4, CTX.M10, CTX.M16

In [None]:
# Load the dataset
gene_list_key = 'ASD_WGCNA'
mat_col_prefix = gene_list_name_dict[gene_list_key]
parik_table_path = os.path.join(project_dir, path_dict[gene_list_key])
parik_gene_df = pd.read_excel(parik_table_path, sheet_name='TableS2a', header=1)

# Split the genes by their modules
groupby_module = parik_gene_df.groupby('WGCNA Module Label')
asd_module_nums = [4, 9, 10, 16, 19, 20]  # Modules significantly associated with ASD
asd_module_dfs = [groupby_module.get_group(asd_module_num) for asd_module_num in asd_module_nums]

# Update the gene matrix
gencode_gene_ids = gene_mat_df.index.values
gencode_gene_id_set = set(gencode_gene_ids)
na_gene_dict = defaultdict(list)  # Key: Column name, Value: The list of gene IDs that are not available

for module_num, asd_module_df in zip(asd_module_nums, asd_module_dfs):
    module_name = f'CTX.M{module_num}'
    mat_col_name = f"{mat_col_prefix}_{module_name}"
    mod_gene_cnt = len(asd_module_df.index.values)

    # Make a set of gene IDs in the module
    mod_gene_ids = asd_module_df['ENSEMBL ID'].values
    mod_gene_symbols = asd_module_df['HGNC Symbol'].values
    mod_gene_id_set = set()
    na_gene_cnt = 0

    for mod_gene_id, mod_gene_symbol in zip(mod_gene_ids, mod_gene_symbols):
        if mod_gene_id in gencode_gene_id_set:
            mod_gene_id_set.add(mod_gene_id)
            continue

        # The gene ID is not in the gene matrix, so look for alternative gene IDs via the gene symbol.
        # A missing symbol cannot be looked up at all.
        if pd.isna(mod_gene_symbol):
            alt_mod_gene_ids = None
        else:
            alt_mod_gene_ids = find_gene_ids(mod_gene_symbol)

        # 'find_gene_ids' returns None if the symbol is unknown; record the gene as not available
        # instead of trying to iterate over None.
        if alt_mod_gene_ids is None:
            na_gene_dict[mat_col_name].append(mod_gene_id)
            na_gene_cnt += 1
        else:
            mod_gene_id_set.update(alt_mod_gene_ids)

    # Update
    gene_mat_vals = np.array([int(gene_id in mod_gene_id_set) for gene_id in gencode_gene_ids])
    gene_mat_df[mat_col_name] = gene_mat_vals
    print(f'[{mat_col_name}] No. all genes: {mod_gene_cnt}, No. not available genes: {na_gene_cnt}')

# Save the dictionary for not available genes as a log
log_path_key = f'{gene_list_key}_LOG'
log_path = os.path.join(project_dir, path_dict[log_path_key])

with open(log_path, 'w') as logfile:
    yaml.dump(dict(na_gene_dict), logfile, default_flow_style=False)

gene_mat_df.head()

##### 05. Nowakowski et al., Science, 2017
- scRNA-seq for primary cortical and medial ganglionic eminence (MGE) in developing human telencephalon
- From 48 samples with 5.85 ~ 37 PCW
- Unbiased clustering and found marker genes for each cluster

In [None]:
# Load the dataset
gene_list_key = 'SC_Now2017'
now_table_path = os.path.join(project_dir, path_dict[gene_list_key])
now_gene_df = pd.read_excel(now_table_path, sheet_name='Table5 - Clustermarkers')

# Split the genes by their clusters
mat_col_prefix = gene_list_name_dict[gene_list_key]
groupby_cluster = now_gene_df.groupby('cluster')
cluster_names = list(groupby_cluster.groups)
gene_cluster_dfs = [groupby_cluster.get_group(name) for name in cluster_names]

# Update the gene matrix
mat_indices = gene_mat_df.index.values
mat_index_set = set(mat_indices)
na_gene_dict = defaultdict(list)  # Key: Column name, Value: The list of marker genes that are not available

for cluster_name, gene_cluster_df in zip(cluster_names, gene_cluster_dfs):
    mat_col_name = f'{mat_col_prefix}_{cluster_name}'
    marker_genes = gene_cluster_df['gene'].values
    marker_gene_cnt = len(marker_genes)

    # Make a set of marker gene IDs
    marker_gene_id_set = set()
    na_gene_cnt = 0

    for marker_gene in marker_genes:
        marker_gene_ids = find_gene_ids(marker_gene)

        if marker_gene_ids is not None:
            marker_gene_id_set.update(marker_gene_ids)
            continue

        # No gene ID can be found for this symbol
        na_gene_dict[mat_col_name].append(marker_gene)
        na_gene_cnt += 1

    # Update: one column per cluster (1 for its marker genes, otherwise 0)
    mark_marker_gene = np.vectorize(lambda gene_id: int(gene_id in marker_gene_id_set))
    gene_mat_vals = mark_marker_gene(mat_indices)
    gene_mat_df[mat_col_name] = gene_mat_vals
    print(f'[{mat_col_name}] No. all genes: {marker_gene_cnt}, No. not available genes: {na_gene_cnt}')

# Save the dictionary for not available genes as a log
log_path_key = f'{gene_list_key}_LOG'
log_path = os.path.join(project_dir, path_dict[log_path_key])

with open(log_path, 'w') as logfile:
    yaml.dump(dict(na_gene_dict), logfile, default_flow_style=False)

gene_mat_df.head()

##### 06. Li et al., Science, 2018
- Single-cell from 9 de-identified postmortem brains (Table S3)
- 5 PCW ~ 20 PCW
- Marker genes for each cell type of Pallium, CP, DFC, NCX regions in a human brain

In [None]:
# Load the dataset
gene_list_key = 'SC_Li2018'
li_table_path = os.path.join(project_dir, path_dict[gene_list_key])
li_gene_df = pd.read_excel(li_table_path, sheet_name='Table S8', header=3)

# The sheet holds two side-by-side tables: the second one has the same column names with a '.1' suffix.
# Split them into two DataFrames, give them the same column names, and stack them.
prenatal_gene_df = li_gene_df[['Gene symbol', 'Cell type', 'Age']].copy()
adult_gene_df = li_gene_df[['Gene symbol.1', 'Cell type.1', 'Age.1']].copy()
suffix_free_names = {colname: colname.split('.')[0] for colname in adult_gene_df.columns.values}
adult_gene_df.rename(columns=suffix_free_names, inplace=True)
adult_gene_df.dropna(inplace=True)
li_gene_df = pd.concat([prenatal_gene_df, adult_gene_df], ignore_index=True)

# Split the genes by their cell types
mat_col_prefix = gene_list_name_dict[gene_list_key]
groupby_cell = li_gene_df.groupby('Cell type')
cell_names = list(groupby_cell.groups)
cell_gene_dfs = [groupby_cell.get_group(name) for name in cell_names]

# Update the gene matrix
mat_indices = gene_mat_df.index.values
na_gene_dict = defaultdict(list)  # Key: Column name, Value: The list of marker genes that are not available

for cell_name, cell_gene_df in zip(cell_names, cell_gene_dfs):
    mat_col_name = f'{mat_col_prefix}_{cell_name}'
    marker_genes = cell_gene_df['Gene symbol'].values
    marker_gene_cnt = len(marker_genes)

    # Make a set of marker genes
    marker_gene_id_set = set()
    na_gene_cnt = 0

    for marker_gene in marker_genes:
        marker_gene_ids = find_gene_ids(marker_gene)

        if marker_gene_ids is not None:
            marker_gene_id_set.update(marker_gene_ids)
            continue

        # No gene ID can be found for this symbol
        na_gene_dict[mat_col_name].append(marker_gene)
        na_gene_cnt += 1

    # Update: one column per cell type (1 for its marker genes, otherwise 0)
    mark_marker_gene = np.vectorize(lambda gene_id: int(gene_id in marker_gene_id_set))
    gene_mat_vals = mark_marker_gene(mat_indices)
    gene_mat_df[mat_col_name] = gene_mat_vals
    print(f'[{mat_col_name}] No. all genes: {marker_gene_cnt}, No. not available genes: {na_gene_cnt}')

# Save the dictionary for not available genes as a log
log_path_key = f'{gene_list_key}_LOG'
log_path = os.path.join(project_dir, path_dict[log_path_key])

with open(log_path, 'w') as logfile:
    yaml.dump(dict(na_gene_dict), logfile, default_flow_style=False)

gene_mat_df.head()

##### 07. Velmeshev et al., Science, 2019
- Single-nucleus sequencing 
- 41 post-mortem tissue samples including prefrontal cortex (PFC) and anterior cingulate cortex (ACC) from 16 controls and 15 ASD patients.

In [None]:
# Load the dataset
gene_list_key = 'SN_Vel2019'
vel_table_path = os.path.join(project_dir, path_dict[gene_list_key])
vel_gene_df = pd.read_excel(vel_table_path, sheet_name='cell type markers')

# Split the genes by their cell types
mat_col_prefix = gene_list_name_dict[gene_list_key]
groupby_cell = vel_gene_df.groupby('Cell type')
cell_names = list(groupby_cell.groups.keys())
cell_gene_dfs = [groupby_cell.get_group(cell_name) for cell_name in cell_names]

# Update the gene matrix
mat_indices = gene_mat_df.index.values
mat_index_set = set(mat_indices)
na_gene_dict = defaultdict(list)  # Key: Column name, Value: The list of gene IDs that are not available

for cell_name, cell_gene_df in zip(cell_names, cell_gene_dfs):
    mat_col_name = f'{mat_col_prefix}_{cell_name}'
    marker_gene_ids = cell_gene_df['Gene ID'].values
    marker_gene_names = cell_gene_df['Gene name'].values
    # Count the marker genes of this cell type (not a variable left over from another cell)
    marker_gene_cnt = len(marker_gene_ids)

    # Make a set of marker genes
    marker_gene_id_set = set()
    na_gene_cnt = 0

    for marker_gene_id, marker_gene_name in zip(marker_gene_ids, marker_gene_names):
        if marker_gene_id in mat_index_set:
            marker_gene_id_set.add(marker_gene_id)
            continue

        # The gene ID is not in the gene matrix, so look for alternative gene IDs via the gene name
        alt_gene_ids = find_gene_ids(marker_gene_name)

        if alt_gene_ids is None:
            na_gene_dict[mat_col_name].append(marker_gene_id)
            na_gene_cnt += 1
        else:
            marker_gene_id_set.update(alt_gene_ids)

    # Update: one column per cell type (1 for its marker genes, otherwise 0)
    gene_mat_vals = np.vectorize(lambda gene_id: 1 if gene_id in marker_gene_id_set else 0)(mat_indices)
    gene_mat_df[mat_col_name] = gene_mat_vals
    print(f'[{mat_col_name}] No. all genes: {marker_gene_cnt}, No. not available genes: {na_gene_cnt}')

# Save the dictionary for not available genes as a log
log_path_key = f'{gene_list_key}_LOG'
log_path = os.path.join(project_dir, path_dict[log_path_key])

with open(log_path, 'w') as logfile:
    yaml.dump(dict(na_gene_dict), logfile, default_flow_style=False)

gene_mat_df.head()

### 5. Save the gene matrix as a text file

In [None]:
# Write your own file path in here
# The gene matrix is written as a tab-separated text file.
# The DataFrame index is not written (index=False); only the columns are saved.
my_gene_mat_path = os.path.join(project_dir, 'my_gene_matrix.txt')
gene_mat_df.to_csv(my_gene_mat_path, sep='\t', index=False)